diff --git a/.gitattributes b/.gitattributes index 82e33954dca281a89464c0fc6cc634f58a761794..597c33bb0e439485785b31c3a29614404b5156ad 100644 --- a/.gitattributes +++ b/.gitattributes @@ -2407,3 +2407,4 @@ wandb/wandb/run-20260419_111433-oh7yfg1j/run-oh7yfg1j.wandb filter=lfs diff=lfs 0422_QwenLatent_13tasks_stateactionprior_50k/videos/pytorch_model/n_action_steps_10_max_episode_steps_720_n_envs_1_gr1_unified/PosttrainPnPNovelFromTrayToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_Env/eb75ba54-de01-4932-90a2-36f3cdeaefaa_success0.mp4 filter=lfs diff=lfs merge=lfs -text 0422_QwenLatent_13tasks_stateactionprior_50k/videos/pytorch_model/n_action_steps_10_max_episode_steps_720_n_envs_1_gr1_unified/PosttrainPnPNovelFromTrayToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_Env/fb67ee40-aa3d-4616-b022-a36af1f3d7d0_success1.mp4 filter=lfs diff=lfs merge=lfs -text 0422_QwenLatent_13tasks_stateactionprior_50k/videos/pytorch_model/n_action_steps_10_max_episode_steps_720_n_envs_1_gr1_unified/PosttrainPnPNovelFromTrayToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_Env/fcdb3d2a-4bc1-4a9b-ae8e-7d9900cea828_success1.mp4 filter=lfs diff=lfs merge=lfs -text +code/dataloader/gr00t_lerobot/__pycache__/datasets.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text diff --git a/code/__init__.py b/code/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/__pycache__/__init__.cpython-310.pyc b/code/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..666ad4051bce5456714e8bce666b79f1ab77b6e8 Binary files /dev/null and b/code/__pycache__/__init__.cpython-310.pyc differ diff --git a/code/__pycache__/__init__.cpython-311.pyc b/code/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d9c8b0ed79534d2dba76a2641881c74481780d5 Binary files /dev/null and b/code/__pycache__/__init__.cpython-311.pyc differ diff --git a/code/config/deepseeds/deepspeed_zero2.yaml b/code/config/deepseeds/deepspeed_zero2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..7ccb145ffa7a8995655e497b356ecfec8e416027 --- /dev/null +++ b/code/config/deepseeds/deepspeed_zero2.yaml @@ -0,0 +1,9 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + deepspeed_config_file: "./starVLA/config/deepseeds/ds_config.yaml" + deepspeed_multinode_launcher: standard + zero3_init_flag: false +distributed_type: DEEPSPEED +num_machines: 1 +num_processes: 8 \ No newline at end of file diff --git a/code/config/deepseeds/deepspeed_zero3.yaml b/code/config/deepseeds/deepspeed_zero3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c1b7cca733e95616507bca81fe71a4c96530c19d --- /dev/null +++ b/code/config/deepseeds/deepspeed_zero3.yaml @@ -0,0 +1,7 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + deepspeed_config_file: "./starVLA/config/deepseeds/zero3.yaml" + deepspeed_multinode_launcher: standard + zero3_init_flag: false +distributed_type: DEEPSPEED \ No newline at end of file diff --git a/code/config/deepseeds/ds_config.yaml b/code/config/deepseeds/ds_config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..2d461f95290fe6574255262ead9e7dcbb81a245e --- /dev/null +++ b/code/config/deepseeds/ds_config.yaml @@ -0,0 +1,23 @@ +{ + "fp16": { + "enabled": false + }, + "bf16": { + "enabled": true + }, + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto", + 
"gradient_accumulation_steps": 1, + "zero_optimization": { + "stage": 2, + "allgather_partitions": true, + "allgather_bucket_size": 5e8, + "reduce_scatter": true, + "reduce_bucket_size": 5e8, + "overlap_comm": true, + "contiguous_gradients": true, + "cpu_offload": false + }, + "gradient_clipping": 1.0, + "steps_per_print": 10 +} \ No newline at end of file diff --git a/code/config/deepseeds/zero2.yaml b/code/config/deepseeds/zero2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..f4df9d409dfc682961b2e9e970721a75cdd3667f --- /dev/null +++ b/code/config/deepseeds/zero2.yaml @@ -0,0 +1,21 @@ +compute_environment: LOCAL_MACHINE +debug: false +deepspeed_config: + deepspeed_multinode_launcher: standard + offload_optimizer_device: none + offload_param_device: none + zero3_init_flag: false + zero_stage: 2 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false \ No newline at end of file diff --git a/code/config/deepseeds/zero3.yaml b/code/config/deepseeds/zero3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..90a219f8070df25ee448e7d0f6a850c1963644a6 --- /dev/null +++ b/code/config/deepseeds/zero3.yaml @@ -0,0 +1,28 @@ +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, + "bf16": { + "enabled": "auto" + }, + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto", + "gradient_accumulation_steps": "auto", + "zero_optimization": { + "stage": 3, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": 5e8, + "stage3_prefetch_bucket_size": 5e8, + "stage3_param_persistence_threshold": 1e6, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true + } +} \ No newline at end of file diff --git a/code/config/training/starvla_train_actionmodel_oxe.yaml b/code/config/training/starvla_train_actionmodel_oxe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ca4db3ec78a74b91823e0129bc6fe95626a4089e --- /dev/null +++ b/code/config/training/starvla_train_actionmodel_oxe.yaml @@ -0,0 +1,85 @@ +run_id: vla_jepa_temp +run_root_dir: ./runs +seed: 21 +trackers: [jsonl, wandb] +wandb_entity: timsty +wandb_project: vla_jepa +is_debug: false + +framework: + name: ActionModelFM + action_model: + action_size: 37 + state_size: 74 + use_state: ${datasets.vla_data.state_use_action_chunk} + hidden_size: 1024 + intermediate_size: 3072 + dataset_vocab_size: 256 + num_data_tokens: 32 + mask_ratio_mode: "uniform_per_traj" + mask_ratio_min: 0.25 + mask_ratio_max: 0.75 + min_action_len: 5 + num_encoder_layers: 28 + num_decoder_layers: 28 + num_attention_heads: 16 + num_key_value_heads: 8 + head_dim: 128 + max_position_embeddings: 4096 + max_action_chunk_size: 50 + rms_norm_eps: 1.0e-6 + attention_dropout: 0.0 + # --- Action model loss mode (choose one combination) --- + use_masked_action_recon: false # true = add reconstruction loss for masked-action view (two-view training) + qwen3_pretrained_name_or_path: /mnt/data/fangyu/model/Qwen/Qwen3-0.6B + +datasets: + vla_data: + dataset_py: lerobot_datasets + data_root_dir: /mnt/data/fangyu/dataset/IPEC-COMMUNITY + data_mix: cross_embodiedment_13tasks + require_language: 
false + # action_type: delta_ee + default_image_resolution: [3, 224, 224] + per_device_batch_size: 256 + load_all_data_for_training: true + load_video: false + obs: ["image_0"] + image_size: [224,224] + video_backend: torchcodec + chunk_size: 15 + # state chunk aligned with action: state shape (L, state_dim) like action (L, action_dim) + state_use_action_chunk: true + +trainer: + epochs: 1000 + max_train_steps: 5000 + num_warmup_steps: 1000 + save_interval: 5000 + eval_interval: 50 + learning_rate: + base: 1e-04 + lr_scheduler_type: cosine_with_min_lr + scheduler_specific_kwargs: + min_lr: 5.0e-07 + freeze_modules: '' + loss_scale: + vla: 1.0 + warmup_ratio: 0.1 + weight_decay: 0.0 + logging_frequency: 10 + gradient_clipping: 5 + gradient_accumulation_steps: 1 + + optimizer: + name: AdamW + betas: [0.9, 0.95] + eps: 1.0e-08 + weight_decay: 1.0e-08 + + # parameters to be determined + is_resume: false + resume_epoch: null + resume_step: null + enable_gradient_checkpointing: true + enable_mixed_precision_training: true diff --git a/code/config/training/starvla_train_pi0.yaml b/code/config/training/starvla_train_pi0.yaml new file mode 100644 index 0000000000000000000000000000000000000000..ce88c0ea70c89294c6fb85b402a2004210ddca6f --- /dev/null +++ b/code/config/training/starvla_train_pi0.yaml @@ -0,0 +1,104 @@ +# PI0 training config - uses the unified 37D action representation +# The action/state projection layers (hard-coded to 32D in the original openpi) are automatically replaced with 37D when PI0Framework is initialized; +# the matching 32D parameters in the checkpoint are skipped at load time, and the remaining backbone parameters are reused as usual. + +run_id: pi0_unified_37d +run_root_dir: ./runs +seed: 42 +trackers: [jsonl, wandb] +wandb_entity: timsty +wandb_project: vla_jepa +is_debug: false + +framework: + name: PI0 + # PI0 model config + # action_dim follows this project (unified 37D action representation). + # In the PI0Pytorch source, action_in_proj / action_out_proj / state_proj are hard-coded to 32D; + # PI0Framework.__init__ calls _replace_pi0_projection_layers to replace them with 37D, + # and when the checkpoint is loaded these layers are skipped because of the shape mismatch (they keep their random initialization). + # The remaining VLM backbone layers (PaliGemma, action expert transformer, etc.) are still loaded from the checkpoint as usual. + pi0: + paligemma_variant: "gemma_2b" + action_expert_variant: "gemma_300m" + pi05: false + action_dim: 37 # project-wide unified dimension; the projection layers are replaced automatically and the corresponding checkpoint parameters are skipped at load time + state_dim: 74 # unified state dimension; state_proj is replaced with Linear(74, width), independent of action_dim + action_horizon: 15 # aligned with chunk_size + dtype: "bfloat16" + + # Pretrained weight path (pi05_libero etc.; when action_dim does not match, weights are partially loaded with strict=False) + pi0_checkpoint: /mnt/data/fangyu/model/openpi/openpi-assets/checkpoints/pi0_base_torch/model.pt + + # PaliGemma tokenizer + tokenizer_path: /root/.cache/openpi/big_vision/paligemma_tokenizer.model + + # Image key names matching the openpi three-view format; for single-view gr1, combine with replicate_single_view + image_keys: + - "base_0_rgb" + - "left_wrist_0_rgb" + - "right_wrist_0_rgb" + + # When the dataset provides only one image, replicate it to all 3 views (e.g. fourier_gr1 video.ego_view) + replicate_single_view: true + + use_state: true + + # If true, dynamically use the first N image_keys according to the actual number of images; otherwise always use all keys and zero-pad missing views + dynamic_image_keys: false + + num_inference_steps: 10 + + # Output truncation dimension; null means the full action_dim is output + effective_action_dim: null + +datasets: + vla_data: + dataset_py: lerobot_datasets + data_root_dir: /mnt/data/fangyu/dataset/IPEC-COMMUNITY + data_mix: cross_embodiedment_simulator + default_image_resolution: [3, 224, 224] + per_device_batch_size: 32 + load_all_data_for_training: true + obs: ["image_0"] + image_size: [224, 224] + video_backend: torchcodec + load_video: true + chunk_size: 15 + state_use_action_chunk: false + num_history_steps: 0 + include_state: false # state is not used when training PI0 + +trainer: + epochs: 100 + max_train_steps: 20000 + num_warmup_steps: 5000 + num_stable_steps: 0 +
save_interval: 5000 + max_checkpoints_to_keep: 20 + + learning_rate: + base: 2.5e-5 + pi0_model: 2.5e-5 + + lr_scheduler_type: warmup_stable_cosine + scheduler_specific_kwargs: + min_lr_ratio: 0.001 + + freeze_modules: "" + warmup_ratio: 0.1 + weight_decay: 0.0 + logging_frequency: 10 + gradient_clipping: 5.0 + gradient_accumulation_steps: 1 + + optimizer: + name: AdamW + betas: [0.9, 0.95] + eps: 1.0e-08 + weight_decay: 1.0e-08 + + is_resume: false + pretrained_checkpoint: null + enable_gradient_checkpointing: false + enable_mixed_precision_training: true diff --git a/code/config/training/starvla_train_qwengr00t.yaml b/code/config/training/starvla_train_qwengr00t.yaml new file mode 100644 index 0000000000000000000000000000000000000000..567a84de5ef4bd65ff9cc8783d690316aad9b1a1 --- /dev/null +++ b/code/config/training/starvla_train_qwengr00t.yaml @@ -0,0 +1,99 @@ +run_id: qwengr00t_oxe +run_root_dir: ./runs +seed: 42 +trackers: [jsonl, wandb] +wandb_entity: timsty +wandb_project: vla_jepa +is_debug: false + +framework: + name: QwenGR00T + qwenvl: + base_vlm: /mnt/data/fangyu/model/Qwen/Qwen3-VL-2B-Instruct + attn_implementation: flash_attention_2 + vl_hidden_dim: 2048 + num_data_tokens: 32 # dataset soft prompt tokens prepended to VLM input (0 = disabled) + + # QwenGR00T required action head config + action_model: + dataset_vocab_size: 256 # number of distinct dataset IDs for soft prompt embedding + action_model_type: DiT-B + hidden_size: 1024 + add_pos_embed: true + max_seq_len: 1024 + action_dim: 37 + state_dim: 74 + future_action_window_size: 14 + action_horizon: 15 + past_action_window_size: 0 + noise_beta_alpha: 1.5 + noise_beta_beta: 1.0 + noise_s: 0.999 + num_timestep_buckets: 1000 + num_inference_timesteps: 10 + num_target_vision_tokens: 32 + diffusion_model_cfg: + cross_attention_dim: 2048 + dropout: 0.2 + final_dropout: true + interleave_self_attention: true + norm_type: "ada_norm" + num_layers: 16 + output_dim: 1024 + positional_embeddings: null + +datasets: + vla_data: + dataset_py: lerobot_datasets + data_root_dir: /mnt/data/fangyu/dataset/IPEC-COMMUNITY + data_mix: cross_embodiedment_13tasks + CoT_prompt: "Task: {instruction}. What are the next 15 actions to take?" 
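+ # Illustration (hypothetical instruction "pick up the red block"): the prompt renders as "Task: pick up the red block. What are the next 15 actions to take?"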
+ default_image_resolution: [3, 224, 224] + per_device_batch_size: 32 + load_all_data_for_training: true + obs: ["image_0"] + image_size: [224, 224] + video_backend: torchcodec + load_video: true + chunk_size: 15 + state_use_action_chunk: false + num_history_steps: 0 + include_state: true + +trainer: + epochs: 100 + max_train_steps: 50000 + num_warmup_steps: 5000 + num_stable_steps: 0 + save_interval: 5000 + eval_interval: 50 + max_checkpoints_to_keep: 20 + + # Used in QwenGR00T.forward() to repeat diffusion training pairs + repeated_diffusion_steps: 1 + + learning_rate: + base: 5e-05 + qwen_vl_interface: 5e-05 + action_model: 5e-05 + lr_scheduler_type: warmup_stable_cosine + scheduler_specific_kwargs: + min_lr_ratio: 0.001 + + freeze_modules: '' + warmup_ratio: 0.1 + logging_frequency: 10 + gradient_clipping: 5.0 + gradient_accumulation_steps: 4 + + optimizer: + name: AdamW + betas: [0.9, 0.95] + eps: 1.0e-08 + weight_decay: 1.0e-08 + + is_resume: false + resume_epoch: null + resume_step: null + enable_gradient_checkpointing: true + enable_mixed_precision_training: true diff --git a/code/config/training/starvla_train_qwenlatent_history_naive_oxe.yaml b/code/config/training/starvla_train_qwenlatent_history_naive_oxe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..30e55e36d0f16a649cabe7e23be6d61acc0ab36c --- /dev/null +++ b/code/config/training/starvla_train_qwenlatent_history_naive_oxe.yaml @@ -0,0 +1,106 @@ +run_id: vla_jepa_temp +run_root_dir: ./runs +seed: 42 +trackers: [jsonl, wandb] +wandb_entity: timsty +wandb_project: vla_jepa +is_debug: false + +framework: + # Naive baseline: history tokens are projected directly via two-layer MLPs + # (history_action_projector + history_state_projector) without any action + # encoder. Directly comparable to QwenLatent_history which uses the full + # action-encoder path for history encoding. + name: QwenLatent_history_naive + qwenvl: + base_vlm: /mnt/data/fangyu/model/Qwen/Qwen3-VL-2B-Instruct + attn_implementation: flash_attention_2 + vl_hidden_dim: 2048 + num_data_tokens: 32 + action_model: + ckpt_path: /mnt/data/fangyu/code/reward_new/runs/0417_Action_9tasks_actionstate_fixchunk15/final_model/pytorch_model.pt + # ckpt_path: null + action_size: 37 + state_size: 74 # consistent with the action model; 0 means state is not used + use_state: ${datasets.vla_data.state_use_action_chunk} + hidden_size: 1024 + intermediate_size: 3072 + dataset_vocab_size: 256 + num_data_tokens: 32 + min_action_len: 5 + num_encoder_layers: 28 + num_decoder_layers: 28 + num_attention_heads: 16 + num_key_value_heads: 8 + head_dim: 128 + max_position_embeddings: 2048 + max_action_chunk_size: 50 + rms_norm_eps: 1.0e-6 + attention_dropout: 0.0 + use_vae_reparameterization: false + use_ema: false # whether to use EMA; if false, the encoder is frozen and only the VLM and decoder are trained + chunk_size: ${datasets.vla_data.chunk_size} + loss_mode: full # full, predict_only + qwen3_pretrained_name_or_path: /mnt/data/fangyu/model/Qwen/Qwen3-0.6B +datasets: + vla_data: + dataset_py: lerobot_datasets + data_root_dir: /mnt/data/fangyu/dataset/IPEC-COMMUNITY + data_mix: cross_embodiedment_simulator # bridge_rt_1 + # action_type: delta_ee + CoT_prompt: "Task: {instruction}. What are the next 15 actions to take?"
+ default_image_resolution: [3, 224, 224] + per_device_batch_size: 32 + load_all_data_for_training: true + obs: ["image_0"] + image_size: [224,224] + video_backend: torchcodec + load_video: true + chunk_size: 30 + # state chunk aligned with the action chunk (consistent with action model training) + state_use_action_chunk: true + # number of history state/action steps; if >0, each sample additionally returns state_history and action_history + num_history_steps: 15 + include_state: ${datasets.vla_data.state_use_action_chunk} + +trainer: + epochs: 100 + max_train_steps: 30000 + num_warmup_steps: 3000 + num_stable_steps: 0 # number of steps to hold max_lr (after warmup) + mode: freeze_action_encoder_decay_aux_loss # freeze_action_encoder_decay_aux_loss + loss_weights_decay_steps: 5000 + + save_interval: 5000 + eval_interval: 50 + max_checkpoints_to_keep: 10 # maximum number of checkpoints to keep; older checkpoints beyond this are deleted + learning_rate: + base: 2.5e-05 + qwen_vl_interface: 2.5e-05 + action_model: 2.5e-05 + lr_scheduler_type: warmup_stable_cosine # options: warmup_stable_cosine (default), onecycle + scheduler_specific_kwargs: + min_lr_ratio: 0.001 # final lr = base_lr * min_lr_ratio + freeze_modules: '' + loss_scale: + align_loss: 1.0 + recon_loss: 1.0 + predict_loss: 1.0 + warmup_ratio: 0.1 + weight_decay: 0.0 + logging_frequency: 10 + gradient_clipping: 5.0 + gradient_accumulation_steps: 1 + + optimizer: + name: AdamW + betas: [0.9, 0.95] + eps: 1.0e-08 + weight_decay: 1.0e-08 + + # parameters to be determined + is_resume: false + resume_epoch: null + resume_step: null + enable_gradient_checkpointing: true + enable_mixed_precision_training: true diff --git a/code/config/training/starvla_train_qwenlatent_history_oxe.yaml b/code/config/training/starvla_train_qwenlatent_history_oxe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..13b338d6af1248941394de94eedb1b5de0ccfa86 --- /dev/null +++ b/code/config/training/starvla_train_qwenlatent_history_oxe.yaml @@ -0,0 +1,102 @@ +run_id: vla_jepa_temp +run_root_dir: ./runs +seed: 42 +trackers: [jsonl, wandb] +wandb_entity: timsty +wandb_project: vla_jepa +is_debug: false + +framework: + name: QwenLatent_history + qwenvl: + base_vlm: /mnt/data/fangyu/model/Qwen/Qwen3-VL-2B-Instruct + attn_implementation: flash_attention_2 + vl_hidden_dim: 2048 + num_data_tokens: 32 + action_model: + ckpt_path: /mnt/data/fangyu/code/reward_new/runs/0418_Action_13tasks_actionstate_fixchunk15/final_model/pytorch_model.pt + # ckpt_path: null + action_size: 37 + state_size: 74 # consistent with the action model; 0 means state is not used + use_state: ${datasets.vla_data.state_use_action_chunk} + hidden_size: 1024 + intermediate_size: 3072 + dataset_vocab_size: 256 + num_data_tokens: 32 + min_action_len: 5 + num_encoder_layers: 28 + num_decoder_layers: 28 + num_attention_heads: 16 + num_key_value_heads: 8 + head_dim: 128 + max_position_embeddings: 2048 + max_action_chunk_size: 50 + rms_norm_eps: 1.0e-6 + attention_dropout: 0.0 + use_vae_reparameterization: false + use_ema: false # whether to use EMA; if false, the encoder is frozen and only the VLM and decoder are trained + chunk_size: ${datasets.vla_data.chunk_size} + loss_mode: full # full, predict_only + qwen3_pretrained_name_or_path: /mnt/data/fangyu/model/Qwen/Qwen3-0.6B +datasets: + vla_data: + dataset_py: lerobot_datasets + data_root_dir: /mnt/data/fangyu/dataset/IPEC-COMMUNITY + data_mix: cross_embodiedment_simulator # bridge_rt_1 + # action_type: delta_ee + CoT_prompt: "Task: {instruction}. What are the next 15 actions to take?"
+ default_image_resolution: [3, 224, 224] + per_device_batch_size: 32 + load_all_data_for_training: true + obs: ["image_0"] + image_size: [224,224] + video_backend: torchcodec + load_video: true + chunk_size: 30 + # state chunk aligned with the action chunk (consistent with action model training) + state_use_action_chunk: true + # number of history state/action steps; if >0, each sample additionally returns state_history and action_history + num_history_steps: 15 + include_state: ${datasets.vla_data.state_use_action_chunk} + +trainer: + epochs: 100 + max_train_steps: 50000 + num_warmup_steps: 5000 + num_stable_steps: 0 # number of steps to hold max_lr (after warmup) + mode: freeze_action_encoder_decay_aux_loss # freeze_action_encoder_decay_aux_loss + loss_weights_decay_steps: 5000 + + save_interval: 5000 + eval_interval: 50 + max_checkpoints_to_keep: 10 # maximum number of checkpoints to keep; older checkpoints beyond this are deleted + learning_rate: + base: 3e-05 + qwen_vl_interface: 3e-05 + action_model: 3e-05 + lr_scheduler_type: warmup_stable_cosine # options: warmup_stable_cosine (default), onecycle + scheduler_specific_kwargs: + min_lr_ratio: 0.001 # final lr = base_lr * min_lr_ratio + freeze_modules: '' + loss_scale: + align_loss: 1.0 + recon_loss: 1.0 + predict_loss: 1.0 + warmup_ratio: 0.1 + weight_decay: 0.0 + logging_frequency: 10 + gradient_clipping: 5.0 + gradient_accumulation_steps: 1 + + optimizer: + name: AdamW + betas: [0.9, 0.95] + eps: 1.0e-08 + weight_decay: 1.0e-08 + + # parameters to be determined + is_resume: false + resume_epoch: null + resume_step: null + enable_gradient_checkpointing: true + enable_mixed_precision_training: true diff --git a/code/config/training/starvla_train_qwenlatent_oxe.yaml b/code/config/training/starvla_train_qwenlatent_oxe.yaml new file mode 100644 index 0000000000000000000000000000000000000000..e01e7a9f3a36bdcf93232d08cbd9f0b459d2d38b --- /dev/null +++ b/code/config/training/starvla_train_qwenlatent_oxe.yaml @@ -0,0 +1,103 @@ +run_id: vla_jepa_temp +run_root_dir: ./runs +seed: 21 +trackers: [jsonl, wandb] +wandb_entity: timsty +wandb_project: vla_jepa +is_debug: false + +framework: + name: QwenLatent + qwenvl: + base_vlm: /mnt/data/fangyu/model/Qwen/Qwen3-VL-2B-Instruct + attn_implementation: flash_attention_2 + vl_hidden_dim: 2048 + num_data_tokens: 32 + action_model: + ckpt_path: /mnt/data/fangyu/code/reward_new/runs/0418_Action_13tasks_actionstate_fixchunk15/final_model/pytorch_model.pt + # ckpt_path: null + action_size: 37 + state_size: 74 # consistent with the action model; 0 means state is not used + use_state: ${datasets.vla_data.state_use_action_chunk} + hidden_size: 1024 + intermediate_size: 3072 + dataset_vocab_size: 256 + num_data_tokens: 32 + num_t_samples: 4 + min_action_len: 5 + num_encoder_layers: 28 + num_decoder_layers: 28 + num_attention_heads: 16 + num_key_value_heads: 8 + head_dim: 128 + max_position_embeddings: 2048 + max_action_chunk_size: 50 + rms_norm_eps: 1.0e-6 + attention_dropout: 0.0 + use_vae_reparameterization: false + use_ema: false # whether to use EMA; if false, the encoder is frozen and only the VLM and decoder are trained + chunk_size: ${datasets.vla_data.chunk_size} + loss_mode: full # full, predict_only + qwen3_pretrained_name_or_path: /mnt/data/fangyu/model/Qwen/Qwen3-0.6B +datasets: + vla_data: + dataset_py: lerobot_datasets + data_root_dir: /mnt/data/fangyu/dataset/IPEC-COMMUNITY + data_mix: cross_embodiedment_13tasks # bridge_rt_1 + # action_type: delta_ee + CoT_prompt: "Task: {instruction}. What are the next 15 actions to take?"
+ default_image_resolution: [3, 224, 224] + per_device_batch_size: 32 + load_all_data_for_training: true + obs: ["image_0"] + image_size: [224,224] + video_backend: torchcodec + load_video: true + chunk_size: 15 + # state chunk aligned with the action chunk (consistent with action model training) + state_use_action_chunk: true + # number of history state/action steps; if >0, each sample additionally returns state_history and action_history + num_history_steps: 0 + include_state: ${datasets.vla_data.state_use_action_chunk} + +trainer: + epochs: 100 + max_train_steps: 50000 + num_warmup_steps: 5000 + num_stable_steps: 0 # number of steps to hold max_lr (after warmup) + mode: decay_aux_loss # freeze_action_encoder_decay_aux_loss + loss_weights_decay_steps: 5000 + + save_interval: 5000 + eval_interval: 50 + max_checkpoints_to_keep: 20 # maximum number of checkpoints to keep; older checkpoints beyond this are deleted + learning_rate: + base: 5e-05 + qwen_vl_interface: 5e-05 + action_model: 5e-05 + lr_scheduler_type: warmup_stable_cosine # options: warmup_stable_cosine (default), onecycle + scheduler_specific_kwargs: + min_lr_ratio: 0.001 # final lr = base_lr * min_lr_ratio + freeze_modules: '' + loss_scale: + align_loss: 1.0 + recon_loss: 1.0 + predict_loss: 1.0 + warmup_ratio: 0.1 + weight_decay: 0.0 + logging_frequency: 10 + gradient_clipping: 5.0 + gradient_accumulation_steps: 1 + + optimizer: + name: AdamW + betas: [0.9, 0.95] + eps: 1.0e-08 + weight_decay: 1.0e-08 + + # parameters to be determined + is_resume: false + resume_epoch: null + resume_step: null + enable_gradient_checkpointing: true + enable_mixed_precision_training: true diff --git a/code/config/training/starvla_train_qwenpi.yaml b/code/config/training/starvla_train_qwenpi.yaml new file mode 100644 index 0000000000000000000000000000000000000000..973e067ab78e241cfb6be57f1c43bce663985e8d --- /dev/null +++ b/code/config/training/starvla_train_qwenpi.yaml @@ -0,0 +1,97 @@ +run_id: qwenpi_oxe +run_root_dir: ./runs +seed: 42 +trackers: [jsonl, wandb] +wandb_entity: timsty +wandb_project: vla_jepa +is_debug: false + +framework: + name: QwenPI + qwenvl: + base_vlm: /mnt/data/fangyu/model/Qwen/Qwen3-VL-2B-Instruct + attn_implementation: flash_attention_2 + vl_hidden_dim: 2048 + num_data_tokens: 32 # dataset soft prompt tokens prepended to VLM input (0 = disabled) + + # QwenPI required action head config (LayerwiseFlowmatchingActionHead) + action_model: + dataset_vocab_size: 256 # number of distinct dataset IDs for soft prompt embedding + hidden_size: 1024 + add_pos_embed: true + max_seq_len: 1024 + action_dim: 37 + state_dim: 74 + future_action_window_size: 14 + action_horizon: 15 + past_action_window_size: 0 + noise_beta_alpha: 1.5 + noise_beta_beta: 1.0 + noise_s: 0.999 + num_timestep_buckets: 1000 + num_inference_timesteps: 10 + num_target_vision_tokens: 32 + diffusion_model_cfg: + dropout: 0.2 + final_dropout: true + interleave_self_attention: true + norm_type: "ada_norm" + output_dim: 1024 + positional_embeddings: null + +datasets: + vla_data: + dataset_py: lerobot_datasets + data_root_dir: /mnt/data/fangyu/dataset/IPEC-COMMUNITY + data_mix: cross_embodiedment_13tasks + CoT_prompt: "Task: {instruction}. What are the next 15 actions to take?"
+ default_image_resolution: [3, 224, 224] + per_device_batch_size: 64 + load_all_data_for_training: true + obs: ["image_0"] + image_size: [224, 224] + video_backend: torchcodec + load_video: true + chunk_size: 15 + state_use_action_chunk: false + num_history_steps: 0 + include_state: true + +trainer: + epochs: 100 + max_train_steps: 50000 + num_warmup_steps: 5000 + num_stable_steps: 0 + save_interval: 5000 + eval_interval: 50 + max_checkpoints_to_keep: 20 + + # Used in QwenPI.forward() to repeat diffusion training pairs + repeated_diffusion_steps: 1 + + learning_rate: + base: 5e-05 + qwen_vl_interface: 5e-05 + action_model: 5e-05 + lr_scheduler_type: warmup_stable_cosine + scheduler_specific_kwargs: + min_lr_ratio: 0.001 + + freeze_modules: '' + warmup_ratio: 0.1 + weight_decay: 0.0 + logging_frequency: 10 + gradient_clipping: 5.0 + gradient_accumulation_steps: 1 + + optimizer: + name: AdamW + betas: [0.9, 0.95] + eps: 1.0e-08 + weight_decay: 1.0e-08 + + is_resume: false + resume_epoch: null + resume_step: null + enable_gradient_checkpointing: true + enable_mixed_precision_training: true diff --git a/code/dataloader/__init__.py b/code/dataloader/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1870aebb219b7d28dafd434b4cdd767ebd90f166 --- /dev/null +++ b/code/dataloader/__init__.py @@ -0,0 +1,70 @@ +import json +from accelerate.logging import get_logger +import numpy as np +from torch.utils.data import DataLoader +import torch.distributed as dist +from pathlib import Path +from starVLA.dataloader.vlm_datasets import make_vlm_dataloader + +logger = get_logger(__name__) + + +def _is_main_process() -> bool: + return (not dist.is_initialized()) or dist.get_rank() == 0 + +def save_dataset_statistics(dataset_statistics, run_dir): + """Saves a `dataset_statistics.json` file.""" + out_path = run_dir / "dataset_statistics.json" + with open(out_path, "w") as f_json: + for _, stats in dataset_statistics.items(): + for k in stats["action"].keys(): + if isinstance(stats["action"][k], np.ndarray): + stats["action"][k] = stats["action"][k].tolist() + if "proprio" in stats: + for k in stats["proprio"].keys(): + if isinstance(stats["proprio"][k], np.ndarray): + stats["proprio"][k] = stats["proprio"][k].tolist() + if "num_trajectories" in stats: + if isinstance(stats["num_trajectories"], np.ndarray): + stats["num_trajectories"] = stats["num_trajectories"].item() + if "num_transitions" in stats: + if isinstance(stats["num_transitions"], np.ndarray): + stats["num_transitions"] = stats["num_transitions"].item() + json.dump(dataset_statistics, f_json, indent=2) + logger.info(f"Saved dataset statistics file at path {out_path}") + + + +def build_dataloader(cfg, dataset_py="lerobot_datasets_oxe"): # TODO: currently this only builds the dataset; move the DataLoader construction here as well + + if dataset_py == "lerobot_datasets": + from starVLA.dataloader.lerobot_datasets import get_vla_dataset, collate_fn + vla_dataset_cfg = cfg.datasets.vla_data + + vla_dataset = get_vla_dataset(data_cfg=vla_dataset_cfg) + + vla_train_dataloader = DataLoader( + vla_dataset, + batch_size=cfg.datasets.vla_data.per_device_batch_size, + collate_fn=collate_fn, + num_workers=16, + prefetch_factor=20, + shuffle=True, + persistent_workers=True, # keep workers alive to avoid restart overhead + pin_memory=True, # speed up host-to-GPU transfers + drop_last=True, # drop the last incomplete batch so training never waits on it + timeout=30, # time out blocked workers instead of waiting indefinitely + ) + if _is_main_process(): + output_dir = Path(cfg.output_dir) + vla_dataset.save_dataset_statistics(output_dir / "dataset_statistics.json") + return
vla_train_dataloader + if dataset_py == "vlm_datasets": + vlm_data_module = make_vlm_dataloader(cfg) + vlm_train_dataloader = vlm_data_module["train_dataloader"] + return vlm_train_dataloader + + raise ValueError( + f"Unsupported dataset builder `{dataset_py}`. " + "Expected one of: `lerobot_datasets`, `vlm_datasets`." + ) diff --git a/code/dataloader/__pycache__/__init__.cpython-310.pyc b/code/dataloader/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1fe6363e9f643179befd76ca55687423d12b6755 Binary files /dev/null and b/code/dataloader/__pycache__/__init__.cpython-310.pyc differ diff --git a/code/dataloader/__pycache__/__init__.cpython-311.pyc b/code/dataloader/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ba328397a494874bfabaef753ce14fe361dee9b5 Binary files /dev/null and b/code/dataloader/__pycache__/__init__.cpython-311.pyc differ diff --git a/code/dataloader/__pycache__/lerobot_datasets.cpython-310.pyc b/code/dataloader/__pycache__/lerobot_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d5eabd173911b64da3cdc32278aa10705fd624e Binary files /dev/null and b/code/dataloader/__pycache__/lerobot_datasets.cpython-310.pyc differ diff --git a/code/dataloader/__pycache__/lerobot_datasets.cpython-311.pyc b/code/dataloader/__pycache__/lerobot_datasets.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..61261bc902e737e6e6f26f633f880e8ee4d6c6b2 Binary files /dev/null and b/code/dataloader/__pycache__/lerobot_datasets.cpython-311.pyc differ diff --git a/code/dataloader/__pycache__/vlm_datasets.cpython-310.pyc b/code/dataloader/__pycache__/vlm_datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9af99a899d1385e6d109471908748e7bdb294e43 Binary files /dev/null and b/code/dataloader/__pycache__/vlm_datasets.cpython-310.pyc differ diff --git a/code/dataloader/__pycache__/vlm_datasets.cpython-311.pyc b/code/dataloader/__pycache__/vlm_datasets.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d1748c26dc9b43618d2c9ac68a9baa8fc7d88562 Binary files /dev/null and b/code/dataloader/__pycache__/vlm_datasets.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/README.md b/code/dataloader/gr00t_lerobot/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/dataloader/gr00t_lerobot/__init__.py b/code/dataloader/gr00t_lerobot/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/dataloader/gr00t_lerobot/__pycache__/__init__.cpython-310.pyc b/code/dataloader/gr00t_lerobot/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6b90b023ae561868dd0b9a758ea561ecf1139210 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/__init__.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/__init__.cpython-311.pyc b/code/dataloader/gr00t_lerobot/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..068ca0a3e6497367f78a9186bf6a70eb7f52268c Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/__init__.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/data_config.cpython-310.pyc 
b/code/dataloader/gr00t_lerobot/__pycache__/data_config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c518480a91f6ee1aee9eb012b1cc81c622a9e7a Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/data_config.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/data_config.cpython-311.pyc b/code/dataloader/gr00t_lerobot/__pycache__/data_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c499b555fed97e3a41956e16391be4d03dbc745 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/data_config.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/datasets.cpython-310.pyc b/code/dataloader/gr00t_lerobot/__pycache__/datasets.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..017f240703418ce614e4f2d85380ae2ea60aa3ce Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/datasets.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/datasets.cpython-311.pyc b/code/dataloader/gr00t_lerobot/__pycache__/datasets.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..91668b00ee19cb121649c3d3232bce1829d76b75 --- /dev/null +++ b/code/dataloader/gr00t_lerobot/__pycache__/datasets.cpython-311.pyc @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:953210fc54145a3fac1026888bb1106cdd6351cb8d4eb9e94161669f67db91d9 +size 105575 diff --git a/code/dataloader/gr00t_lerobot/__pycache__/embodiment_tags.cpython-310.pyc b/code/dataloader/gr00t_lerobot/__pycache__/embodiment_tags.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7876723fc75719e26a2ae5add22fef079092f77f Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/embodiment_tags.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/embodiment_tags.cpython-311.pyc b/code/dataloader/gr00t_lerobot/__pycache__/embodiment_tags.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5af0a0294f04149816a7c5b9135463e7bbf06b5d Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/embodiment_tags.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/mixtures.cpython-310.pyc b/code/dataloader/gr00t_lerobot/__pycache__/mixtures.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b138cd14fa236b016d6ae5837e78fb225127a955 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/mixtures.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/mixtures.cpython-311.pyc b/code/dataloader/gr00t_lerobot/__pycache__/mixtures.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..399d2341ed2c0facf3fe61d05498a86569724ee8 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/mixtures.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/schema.cpython-310.pyc b/code/dataloader/gr00t_lerobot/__pycache__/schema.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..22e876ba93a5085a3e175340d0c5761c7995f380 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/schema.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/schema.cpython-311.pyc b/code/dataloader/gr00t_lerobot/__pycache__/schema.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..e0fa0082cd0e111e638e0d9a6ca2505c4efa3151 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/schema.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/video.cpython-310.pyc b/code/dataloader/gr00t_lerobot/__pycache__/video.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8bd8b945c6cf9f84e8f432eeba3ed83407921831 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/video.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/__pycache__/video.cpython-311.pyc b/code/dataloader/gr00t_lerobot/__pycache__/video.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0c97f017a1ae13acc807df2e6d981ee781a32b1 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/__pycache__/video.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/data_config.py b/code/dataloader/gr00t_lerobot/data_config.py new file mode 100644 index 0000000000000000000000000000000000000000..99433c60385b51fe10a47e2ea1069a39d27335e8 --- /dev/null +++ b/code/dataloader/gr00t_lerobot/data_config.py @@ -0,0 +1,392 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
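+# Orientation note: each per-robot data config below pairs a modality_config() (which keys and delta indices to sample for video/state/action/language) with a transform() (tensor conversion plus normalization), and get_robot_type_config_map() at the bottom of the file exposes one configured instance per robot type.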
+ +from abc import ABC, abstractmethod + +from starVLA.dataloader.gr00t_lerobot.datasets import ModalityConfig +from starVLA.dataloader.gr00t_lerobot.transform.base import ComposedModalityTransform, ModalityTransform +from starVLA.dataloader.gr00t_lerobot.transform.state_action import ( + StateActionSinCosTransform, + StateActionToTensor, + StateActionTransform, +) + + +class BaseDataConfig(ABC): + @abstractmethod + def modality_config(self) -> dict[str, ModalityConfig]: + pass + + @abstractmethod + def transform(self) -> ModalityTransform: + pass + + +########################################################################################### + +class Libero4in1DataConfig: + video_keys = [ + "video.primary_image", + "video.wrist_image", + ] + + state_keys = [ + "state.x", + "state.y", + "state.z", + "state.roll", + "state.pitch", + "state.yaw", + "state.pad", + "state.gripper", + ] + action_keys = [ + "action.x", + "action.y", + "action.z", + "action.roll", + "action.pitch", + "action.yaw", + "action.gripper", + ] + + language_keys = ["annotation.human.action.task_description"] + + observation_indices = [0] + action_indices = list(range(16)) + + def __init__(self, chunk_size: int = 16, state_use_action_chunk: bool = False, num_history_steps: int = 0): + self.chunk_size = chunk_size + self.action_indices = list(range(chunk_size)) + self.state_use_action_chunk = state_use_action_chunk + self.num_history_steps = int(num_history_steps or 0) + self.video_observation_indices = [0] if self.num_history_steps == 0 else [0, self.num_history_steps - 1] + + def modality_config(self): + video_modality = ModalityConfig( + delta_indices=self.video_observation_indices, + modality_keys=self.video_keys, + ) + state_delta = self.action_indices if getattr(self, "state_use_action_chunk", False) else self.observation_indices + state_modality = ModalityConfig( + delta_indices=state_delta, + modality_keys=self.state_keys, + ) + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + return modality_configs + + def transform(self): + transforms = [ + # state transforms + StateActionToTensor(apply_to=self.state_keys), + StateActionTransform( + apply_to=self.state_keys, + normalization_modes={ + "state.x": "min_max", + "state.y": "min_max", + "state.z": "min_max", + "state.roll": "min_max", + "state.pitch": "min_max", + "state.yaw": "min_max", + "state.pad": "min_max", + # "state.gripper": "binary", + }, + ), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={ + "action.x": "min_max", + "action.y": "min_max", + "action.z": "min_max", + "action.roll": "min_max", + "action.pitch": "min_max", + "action.yaw": "min_max", + # "action.gripper": "binary", + }, + ), + ] + + return ComposedModalityTransform(transforms=transforms) + +########################################################################################### + +class RealWorldFrankaDataConfig: + """Real-world Panda robot: 7 joints + 1 gripper (8D), single-arm -> right slot [7:15].""" + video_keys = [ + "video.exterior_image_1_left", + "video.wrist_image_left", + ] + state_keys = [ + "state.joints", + "state.gripper", + ] + action_keys = [ + 
"action.joints", + "action.gripper", + ] + language_keys = ["annotation.human.action.task_description"] + observation_indices = [0] + action_indices = list(range(16)) + + def __init__(self, chunk_size: int = 16, state_use_action_chunk: bool = False, num_history_steps: int = 0): + self.chunk_size = chunk_size + self.action_indices = list(range(chunk_size)) + self.state_use_action_chunk = state_use_action_chunk + self.num_history_steps = int(num_history_steps or 0) + self.video_observation_indices = [0] if self.num_history_steps == 0 else [0, self.num_history_steps - 1] + + def modality_config(self): + video_modality = ModalityConfig( + delta_indices=self.video_observation_indices, + modality_keys=self.video_keys, + ) + state_delta = self.action_indices if getattr(self, "state_use_action_chunk", False) else self.observation_indices + state_modality = ModalityConfig( + delta_indices=state_delta, + modality_keys=self.state_keys, + ) + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + return modality_configs + + def transform(self): + transforms = [ + StateActionToTensor(apply_to=self.state_keys), + StateActionTransform( + apply_to=self.state_keys, + normalization_modes={ + "state.joints": "min_max", + # "state.gripper": "binary", + }, + ), + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={ + "action.joints": "min_max", + # "action.gripper": "binary", + }, + ), + ] + return ComposedModalityTransform(transforms=transforms) + + +class AgilexDataConfig: + video_keys = [ + "video.cam_high", + "video.cam_left_wrist", + "video.cam_right_wrist", + ] + state_keys = [ + "state.left_joints", + "state.left_gripper", + "state.right_joints", + "state.right_gripper", + ] + action_keys = [ + "action.left_joints", + "action.left_gripper", + "action.right_joints", + "action.right_gripper", + ] + + language_keys = ["annotation.human.action.task_description"] + observation_indices = [0] + + def __init__(self, chunk_size: int = 16, state_use_action_chunk: bool = False, num_history_steps: int = 0): + self.chunk_size = chunk_size + self.action_indices = list(range(chunk_size)) + self.state_use_action_chunk = state_use_action_chunk + self.num_history_steps = int(num_history_steps or 0) + self.video_observation_indices = [0] if self.num_history_steps == 0 else [0, self.num_history_steps - 1] + + def modality_config(self): + video_modality = ModalityConfig( + delta_indices=self.video_observation_indices, + modality_keys=self.video_keys, + ) + state_delta = self.action_indices if getattr(self, "state_use_action_chunk", False) else self.observation_indices + state_modality = ModalityConfig( + delta_indices=state_delta, + modality_keys=self.state_keys, + ) + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + return modality_configs + + def transform(self): + transforms = [ + # state transforms + 
StateActionToTensor(apply_to=self.state_keys), + StateActionTransform( + apply_to=self.state_keys, + normalization_modes={ + "state.left_joints": "min_max", + "state.left_gripper": "binary", + "state.right_joints": "min_max", + "state.right_gripper": "binary", + }, + ), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={ + "action.left_joints": "min_max", + "action.left_gripper": "binary", + "action.right_joints": "min_max", + "action.right_gripper": "binary", + }, + ), + ] + return ComposedModalityTransform(transforms=transforms) + + +class FourierGr1ArmsWaistDataConfig: + video_keys = ["video.ego_view"] + state_keys = [ + "state.left_arm", + "state.right_arm", + "state.left_hand", + "state.right_hand", + "state.waist", + ] + action_keys = [ + "action.left_arm", + "action.right_arm", + "action.left_hand", + "action.right_hand", + "action.waist", + ] + language_keys = ["annotation.human.coarse_action"] + observation_indices = [0] + + def __init__(self, chunk_size: int = 16, state_use_action_chunk: bool = False, num_history_steps: int = 0): + self.chunk_size = chunk_size + self.action_indices = list(range(chunk_size)) + self.state_use_action_chunk = state_use_action_chunk + self.num_history_steps = int(num_history_steps or 0) + self.video_observation_indices = [0] if self.num_history_steps == 0 else [0, self.num_history_steps - 1] + + def modality_config(self): + video_modality = ModalityConfig( + delta_indices=self.video_observation_indices, + modality_keys=self.video_keys, + ) + state_delta = self.action_indices if getattr(self, "state_use_action_chunk", False) else self.observation_indices + state_modality = ModalityConfig( + delta_indices=state_delta, + modality_keys=self.state_keys, + ) + action_modality = ModalityConfig( + delta_indices=self.action_indices, + modality_keys=self.action_keys, + ) + language_modality = ModalityConfig( + delta_indices=self.observation_indices, + modality_keys=self.language_keys, + ) + modality_configs = { + "video": video_modality, + "state": state_modality, + "action": action_modality, + "language": language_modality, + } + return modality_configs + + def transform(self) -> ModalityTransform: + transforms = [ + # state transforms + StateActionToTensor(apply_to=self.state_keys), + StateActionSinCosTransform(apply_to=self.state_keys), + # action transforms + StateActionToTensor(apply_to=self.action_keys), + StateActionTransform( + apply_to=self.action_keys, + normalization_modes={key: "min_max" for key in self.action_keys}, + ), + ] + return ComposedModalityTransform(transforms=transforms) + +########################################################################################### + + +def get_robot_type_config_map( + chunk_size: int = 15, + state_use_action_chunk: bool = True, + num_history_steps: int = 0, +) -> dict[str, BaseDataConfig]: + """state_use_action_chunk: when True, state uses action_indices so state has shape (L, state_dim) aligned with action chunk.""" + return { + "libero_franka": Libero4in1DataConfig( + chunk_size=chunk_size, + state_use_action_chunk=state_use_action_chunk, + num_history_steps=num_history_steps, + ), + "robotwin": AgilexDataConfig( + chunk_size=chunk_size, + state_use_action_chunk=state_use_action_chunk, + num_history_steps=num_history_steps, + ), + "fourier_gr1_arms_waist": FourierGr1ArmsWaistDataConfig( + chunk_size=chunk_size, + state_use_action_chunk=state_use_action_chunk, + num_history_steps=num_history_steps, + ), + 
"real_world_franka": RealWorldFrankaDataConfig( + chunk_size=chunk_size, + state_use_action_chunk=state_use_action_chunk, + num_history_steps=num_history_steps, + ), + } diff --git a/code/dataloader/gr00t_lerobot/datasets.py b/code/dataloader/gr00t_lerobot/datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..de7275a30abf6d260defb55273b93a2bf8d9dade --- /dev/null +++ b/code/dataloader/gr00t_lerobot/datasets.py @@ -0,0 +1,2165 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +In this file, we define 3 types of datasets: +1. LeRobotSingleDataset: a single dataset for a given embodiment tag +2. LeRobotMixtureDataset: a mixture of datasets for a given list of embodiment tags +3. CachedLeRobotSingleDataset: a single dataset for a given embodiment tag, + with caching for the video frames + +See `scripts/load_dataset.py` for examples on how to use these datasets. +""" +import os +import hashlib +import json, torch +from collections import defaultdict +from pathlib import Path +from typing import Sequence +import os, random +import numpy as np +import pandas as pd +from pydantic import BaseModel, Field, ValidationError +from torch.utils.data import Dataset +from tqdm import tqdm +from PIL import Image + +from starVLA.dataloader.gr00t_lerobot.video import get_all_frames, get_frames_by_timestamps + +from starVLA.dataloader.gr00t_lerobot.embodiment_tags import EmbodimentTag, DATASET_NAME_TO_ID +from starVLA.dataloader.gr00t_lerobot.schema import ( + DatasetMetadata, + DatasetStatisticalValues, + LeRobotModalityMetadata, + LeRobotStateActionMetadata, +) +from starVLA.dataloader.gr00t_lerobot.transform import ComposedModalityTransform + +from functools import partial +from typing import Tuple, List +import pickle + +# LeRobot v2.0 dataset file names +LE_ROBOT_MODALITY_FILENAME = "meta/modality.json" +LE_ROBOT_EPISODE_FILENAME = "meta/episodes.jsonl" +LE_ROBOT_TASKS_FILENAME = "meta/tasks.jsonl" +LE_ROBOT_INFO_FILENAME = "meta/info.json" +LE_ROBOT_STATS_FILENAME = "meta/stats_gr00t.json" +LE_ROBOT_DATA_FILENAME = "data/*/*.parquet" +LE_ROBOT_STEPS_FILENAME = "meta/steps.pkl" +EPSILON = 5e-4 + +# LeRobot v3.0 dataset file names +LE_ROBOT3_TASKS_FILENAME = "meta/tasks.parquet" +LE_ROBOT3_EPISODE_FILENAME = "meta/episodes/*/*.parquet" + + +# ============================================================================= +# Unified Representation Layout & Helpers +# ============================================================================= + +STANDARD_ACTION_DIM = 37 +# +# Unified action representation layout (0-based indices, Python slice is [start, stop)): +# Keep only: libero_franka, gr1, real_world_franka. 
+# +# - 0:7 -> left_arm (7D): xyz, rpy/euler, gripper +# Used by: gr1 left_arm +# - 7:14 -> right_arm (7D): same structure +# Used by: libero_franka; gr1 right_arm +# - 14:20 -> left_hand (6D): gr1 only +# - 20:26 -> right_hand (6D): gr1 only +# - 26:29 -> waist (3D): gr1 only +# - 29:37 -> joints + gripper (8D): real_world_franka only +# +# Mapping: +# libero_franka (7D) -> [7:14] (right_arm slot) +# gr1 (29D) -> [0:29] +# real_world_franka (8D) -> [29:37] (joints + gripper) + +ACTION_REPRESENTATION_SLICES = { + # Single-arm (7D) -> right_arm slot [7:14] + "franka": slice(7, 14), + + # Humanoid (29D) -> full [0:29] + "gr1": slice(0, 29), + + # Real-world (8D) -> [29:37] (joints + gripper) + "real_world_franka": slice(29, 37), +} + +STANDARD_STATE_DIM = 74 +# Mapping: +# libero_franka (8D) -> [0:8] +# real_world_franka (8D) -> [8:16] +# gr1 (58D after sin/cos) -> [16:74] + +STATE_REPRESENTATION_SLICES = { + # Single-arm (8D) + "franka": slice(0, 8), + # Real-world (8D) + "real_world_franka": slice(8, 16), + # GR1 isolated (58D, has StateActionSinCosTransform - different pipeline) + "gr1": slice(16, 74), +} + + +def standardize_action_representation( + action: np.ndarray, embodiment_tag: str +) -> np.ndarray: + """Map per-robot action to a fixed-size standard action vector.""" + target_slice = ACTION_REPRESENTATION_SLICES.get(embodiment_tag) + + # Only allow explicitly configured embodiment tags. + if target_slice is None: + raise ValueError( + f"Unknown embodiment tag '{embodiment_tag}' for action mapping. " + f"Known tags: {sorted(ACTION_REPRESENTATION_SLICES)}" + ) + + expected_dim = target_slice.stop - target_slice.start + if action.shape[-1] != expected_dim: + raise ValueError( + f"Action dim mismatch for tag '{embodiment_tag}': " + f"{action.shape[-1]=} vs expected {expected_dim}." + ) + + standard = np.zeros( + (*action.shape[:-1], STANDARD_ACTION_DIM), dtype=action.dtype + ) + standard[..., target_slice] = action + return standard + + +def standardize_state_representation( + state: np.ndarray, embodiment_tag: str +) -> np.ndarray: + """Map per-robot state to a fixed-size standard state vector.""" + + target_slice = STATE_REPRESENTATION_SLICES.get(embodiment_tag) + + # Only allow explicitly configured embodiment tags. + if target_slice is None: + raise ValueError( + f"Unknown embodiment tag '{embodiment_tag}' for state mapping. " + f"Known tags: {sorted(STATE_REPRESENTATION_SLICES)}" + ) + + expected_dim = target_slice.stop - target_slice.start + if state.shape[-1] != expected_dim: + raise ValueError( + f"State dim mismatch for tag '{embodiment_tag}': " + f"{state.shape[-1]=} vs expected {expected_dim}." 
+ ) + + standard = np.zeros( + (*state.shape[:-1], STANDARD_STATE_DIM), dtype=state.dtype + ) + standard[..., target_slice] = state + return standard + + +def calculate_dataset_statistics(parquet_paths: list[Path]) -> dict: + """Calculate the dataset statistics of all columns for a list of parquet files.""" + # Dataset statistics + all_low_dim_data_list = [] + # Collect all the data + # parquet_paths = parquet_paths[:3] + for parquet_path in tqdm( + sorted(list(parquet_paths)), + desc="Collecting all parquet files...", + ): + # Load the parquet file + parquet_data = pd.read_parquet(parquet_path) + parquet_data = parquet_data + all_low_dim_data_list.append(parquet_data) + + all_low_dim_data = pd.concat(all_low_dim_data_list, axis=0) + # Compute dataset statistics + dataset_statistics = {} + for le_modality in all_low_dim_data.columns: + if le_modality.startswith("annotation."): + continue + print(f"Computing statistics for {le_modality}...") + np_data = np.vstack( + [np.asarray(x, dtype=np.float32) for x in all_low_dim_data[le_modality]] + ) + dataset_statistics[le_modality] = { + "mean": np.mean(np_data, axis=0).tolist(), + "std": np.std(np_data, axis=0).tolist(), + "min": np.min(np_data, axis=0).tolist(), + "max": np.max(np_data, axis=0).tolist(), + "q01": np.quantile(np_data, 0.01, axis=0).tolist(), + "q99": np.quantile(np_data, 0.99, axis=0).tolist(), + } + return dataset_statistics + + +class ModalityConfig(BaseModel): + """Configuration for a modality.""" + + delta_indices: list[int] + """Delta indices to sample relative to the current index. The returned data will correspond to the original data at a sampled base index + delta indices.""" + modality_keys: list[str] + """The keys to load for the modality in the dataset.""" + + +class LeRobotSingleDataset(Dataset): + """ + Base dataset class for LeRobot that supports sharding. + """ + def __init__( + self, + dataset_path: Path | str, + modality_configs: dict[str, ModalityConfig], + embodiment_tag: str | EmbodimentTag, + video_backend: str = "decord", + video_backend_kwargs: dict | None = None, + transforms: ComposedModalityTransform | None = None, + delete_pause_frame: bool = False, + **kwargs, + ): + """ + Initialize the dataset. + + Args: + dataset_path (Path | str): The path to the dataset. + modality_configs (dict[str, ModalityConfig]): The configuration for each modality. The keys are the modality names, and the values are the modality configurations. + See `ModalityConfig` for more details. + video_backend (str): Backend for video reading. + video_backend_kwargs (dict): Keyword arguments for the video backend when initializing the video reader. + transforms (ComposedModalityTransform): The transforms to apply to the dataset. + embodiment_tag (EmbodimentTag): Overload the embodiment tag for the dataset. e.g. define it as "new_embodiment" + """ + # first check if the path directory exists + if not Path(dataset_path).exists(): + raise FileNotFoundError(f"Dataset path {dataset_path} does not exist") + data_cfg = kwargs.get("data_cfg", {}) or {} + # indict letobot version + self._lerobot_version = data_cfg.get("lerobot_version", "v2.0") #self._indict_lerobot_version(**kwargs) + self.load_video = data_cfg.get("load_video", True) + self.num_history_steps = int(data_cfg.get("num_history_steps", 0) or 0) + + self.delete_pause_frame = delete_pause_frame + + # If video loading is disabled, skip video modality end-to-end. 
+ if self.load_video: + self.modality_configs = modality_configs + else: + self.modality_configs = { + modality: config + for modality, config in modality_configs.items() + if modality != "video" + } + self.video_backend = video_backend + self.video_backend_kwargs = video_backend_kwargs if video_backend_kwargs is not None else {} + self.transforms = ( + transforms if transforms is not None else ComposedModalityTransform(transforms=[]) + ) + + self._dataset_path = Path(dataset_path) + self._dataset_name = self._dataset_path.name + self._dataset_id = DATASET_NAME_TO_ID.get(self._dataset_name) + if isinstance(embodiment_tag, EmbodimentTag): + self.tag = embodiment_tag.value + else: + self.tag = embodiment_tag + + self._metadata = self._get_metadata(EmbodimentTag(self.tag)) + + # LeRobot-specific config + self._lerobot_modality_meta = self._get_lerobot_modality_meta() + self._lerobot_info_meta = self._get_lerobot_info_meta() + self._data_path_pattern = self._get_data_path_pattern() + self._video_path_pattern = self._get_video_path_pattern() + self._chunk_size = self._get_chunk_size() + self._tasks = self._get_tasks() + self.curr_traj_data = None + self.curr_traj_id = None + + self._trajectory_ids, self._trajectory_lengths = self._get_trajectories() + self._modality_keys = self._get_modality_keys() + self._delta_indices = self._get_delta_indices() + self._all_steps = self._get_all_steps() + self.set_transforms_metadata(self.metadata) + self.set_epoch(0) + + print(f"Initialized dataset {self.dataset_name} with {embodiment_tag}") + + + # Check if the dataset is valid + self._check_integrity() + + @property + def dataset_path(self) -> Path: + """The path to the dataset that contains the METADATA_FILENAME file.""" + return self._dataset_path + + @property + def metadata(self) -> DatasetMetadata: + """The metadata for the dataset, loaded from metadata.json in the dataset directory""" + return self._metadata + + @property + def trajectory_ids(self) -> np.ndarray: + """The trajectory IDs in the dataset, stored as a 1D numpy array of strings.""" + return self._trajectory_ids + + @property + def trajectory_lengths(self) -> np.ndarray: + """The trajectory lengths in the dataset, stored as a 1D numpy array of integers. + The order of the lengths is the same as the order of the trajectory IDs. + """ + return self._trajectory_lengths + + @property + def all_steps(self) -> list[tuple[int, int]]: + """The trajectory IDs and base indices for all steps in the dataset. + Example: + self.trajectory_ids: [0, 1, 2] + self.trajectory_lengths: [3, 2, 4] + return: [ + ("traj_0", 0), ("traj_0", 1), ("traj_0", 2), + ("traj_1", 0), ("traj_1", 1), + ("traj_2", 0), ("traj_2", 1), ("traj_2", 2), ("traj_2", 3) + ] + """ + return self._all_steps + + @property + def modality_keys(self) -> dict: + """The modality keys for the dataset. The keys are the modality names, and the values are the keys for each modality. + + Example: { + "video": ["video.image_side_0", "video.image_side_1"], + "state": ["state.eef_position", "state.eef_rotation"], + "action": ["action.eef_position", "action.eef_rotation"], + "language": ["language.human.task"], + "timestamp": ["timestamp"], + "reward": ["reward"], + } + """ + return self._modality_keys + + @property + def delta_indices(self) -> dict[str, np.ndarray]: + """The delta indices for the dataset. 
The keys are the modality.key, and the values are the delta indices for each modality.key.""" + return self._delta_indices + + @property + def dataset_name(self) -> str: + """The name of the dataset.""" + return self._dataset_name + + @property + def lerobot_modality_meta(self) -> LeRobotModalityMetadata: + """The metadata for the LeRobot dataset.""" + return self._lerobot_modality_meta + + @property + def lerobot_info_meta(self) -> dict: + """The metadata for the LeRobot dataset.""" + return self._lerobot_info_meta + + @property + def data_path_pattern(self) -> str: + """The path pattern for the LeRobot dataset.""" + return self._data_path_pattern + + @property + def video_path_pattern(self) -> str: + """The path pattern for the LeRobot dataset.""" + return self._video_path_pattern + + @property + def chunk_size(self) -> int: + """The chunk size for the LeRobot dataset.""" + return self._chunk_size + + @property + def tasks(self) -> pd.DataFrame: + """The tasks for the dataset.""" + return self._tasks + + def _get_metadata(self, embodiment_tag: EmbodimentTag) -> DatasetMetadata: + """Get the metadata for the dataset. + + Returns: + dict: The metadata for the dataset. + """ + + # 1. Modality metadata + modality_meta_path = self.dataset_path / LE_ROBOT_MODALITY_FILENAME + assert ( + modality_meta_path.exists() + ), f"Please provide a {LE_ROBOT_MODALITY_FILENAME} file in {self.dataset_path}" + # 1.1. State and action modalities + simplified_modality_meta: dict[str, dict] = {} + with open(modality_meta_path, "r") as f: + le_modality_meta = LeRobotModalityMetadata.model_validate(json.load(f)) + for modality in ["state", "action"]: + simplified_modality_meta[modality] = {} + le_state_action_meta: dict[str, LeRobotStateActionMetadata] = getattr( + le_modality_meta, modality + ) + for subkey in le_state_action_meta: + state_action_dtype = np.dtype(le_state_action_meta[subkey].dtype) + if np.issubdtype(state_action_dtype, np.floating): + continuous = True + else: + continuous = False + simplified_modality_meta[modality][subkey] = { + "absolute": le_state_action_meta[subkey].absolute, + "rotation_type": le_state_action_meta[subkey].rotation_type, + "shape": [ + le_state_action_meta[subkey].end - le_state_action_meta[subkey].start + ], + "continuous": continuous, + } + + # 1.2. Video modalities + le_info_path = self.dataset_path / LE_ROBOT_INFO_FILENAME + assert ( + le_info_path.exists() + ), f"Please provide a {LE_ROBOT_INFO_FILENAME} file in {self.dataset_path}" + with open(le_info_path, "r") as f: + le_info = json.load(f) + simplified_modality_meta["video"] = {} + for new_key in le_modality_meta.video: + original_key = le_modality_meta.video[new_key].original_key + if original_key is None: + original_key = new_key + le_video_meta = le_info["features"][original_key] + height = le_video_meta["shape"][le_video_meta["names"].index("height")] + width = le_video_meta["shape"][le_video_meta["names"].index("width")] + # NOTE(FH): different lerobot dataset versions have different keys for the number of channels and fps + try: + channels = le_video_meta["shape"][le_video_meta["names"].index("channel")] + fps = le_video_meta["video_info"]["video.fps"] + except (ValueError, KeyError): + # channels = le_video_meta["shape"][le_video_meta["names"].index("channels")] + channels = le_video_meta["info"]["video.channels"] + fps = le_video_meta["info"]["video.fps"] + simplified_modality_meta["video"][new_key] = { + "resolution": [width, height], + "channels": channels, + "fps": fps, + } + + # 2. 
Dataset statistics + stats_path = self.dataset_path / LE_ROBOT_STATS_FILENAME + try: + with open(stats_path, "r") as f: + le_statistics = json.load(f) + for stat in le_statistics.values(): + DatasetStatisticalValues.model_validate(stat) + except (FileNotFoundError, ValidationError) as e: + print(f"Failed to load dataset statistics: {e}") + print(f"Calculating dataset statistics for {self.dataset_name}") + # Get all parquet files in the dataset paths + parquet_files = list((self.dataset_path).glob(LE_ROBOT_DATA_FILENAME)) + parquet_files_filtered = [] + # parquet_files[0].name = "episode_033675.parquet" is broken file + for pf in parquet_files: + if "episode_033675.parquet" in pf.name: + continue + parquet_files_filtered.append(pf) + + le_statistics = calculate_dataset_statistics(parquet_files_filtered) + with open(stats_path, "w") as f: + json.dump(le_statistics, f, indent=4) + dataset_statistics = {} + for our_modality in ["state", "action"]: + dataset_statistics[our_modality] = {} + for subkey in simplified_modality_meta[our_modality]: + dataset_statistics[our_modality][subkey] = {} + state_action_meta = le_modality_meta.get_key_meta(f"{our_modality}.{subkey}") + assert isinstance(state_action_meta, LeRobotStateActionMetadata) + le_modality = state_action_meta.original_key + for stat_name in le_statistics[le_modality]: + indices = np.arange( + state_action_meta.start, + state_action_meta.end, + ) + stat = np.array(le_statistics[le_modality][stat_name]) + dataset_statistics[our_modality][subkey][stat_name] = stat[indices].tolist() + + # 3. Full dataset metadata + metadata = DatasetMetadata( + statistics=dataset_statistics, # type: ignore + modalities=simplified_modality_meta, # type: ignore + embodiment_tag=embodiment_tag, + ) + + return metadata + + def _get_trajectories(self) -> tuple[np.ndarray, np.ndarray]: + """Get the trajectories in the dataset.""" + # Get trajectory lengths, IDs, and whitelist from dataset metadata + # v2.0 + if self._lerobot_version == "v2.0": + file_path = self.dataset_path / LE_ROBOT_EPISODE_FILENAME + with open(file_path, "r") as f: + episode_metadata = [json.loads(line) for line in f] + trajectory_ids = [] + trajectory_lengths = [] + for episode in episode_metadata: + trajectory_ids.append(episode["episode_index"]) + trajectory_lengths.append(episode["length"]) + return np.array(trajectory_ids), np.array(trajectory_lengths) + # v3.0 + elif self._lerobot_version == "v3.0": + file_paths = list((self.dataset_path).glob(LE_ROBOT3_EPISODE_FILENAME)) + trajectory_ids = [] + trajectory_lengths = [] + # data_chunck_index = [] + # data_file_index = [] + # vido_from_index = [] + self.trajectory_ids_to_metadata = {} + for file_path in file_paths: + episodes_data = pd.read_parquet(file_path) + for index, episode in episodes_data.iterrows(): + trajectory_ids.append(episode["episode_index"]) + trajectory_lengths.append(episode["length"]) + + # TODO auto map key? 
just map to file_path and file_from_index + episode_meta = { + "data/chunk_index": episode["data/chunk_index"], + "data/file_index": episode["data/file_index"], + "data/file_from_index": index, + } + if self.load_video: + episode_meta["videos/observation.images.wrist/from_timestamp"] = episode[ + "videos/observation.images.wrist/from_timestamp" + ] + self.trajectory_ids_to_metadata[trajectory_ids[-1]] = episode_meta + + # 这里应该可以直接读取到 save index 信息 + return np.array(trajectory_ids), np.array(trajectory_lengths) + + def _get_all_steps(self) -> list[tuple[int, int]]: + """Get the trajectory IDs and base indices for all steps in the dataset. + + Returns: + list[tuple[str, int]]: A list of (trajectory_id, base_index) tuples. + """ + # Create a hash key based on configuration to ensure cache validity + config_key = self._get_steps_config_key() + + # Create a unique filename based on config_key + # steps_filename = f"steps_{config_key}.pkl" + # @BUG + # fast get static steps @fangjing --> don't use hash to dynamic sample + steps_filename = "steps_data_index.pkl" + + + steps_path = self.dataset_path / "meta" / steps_filename + + # Try to load cached steps first + try: + if steps_path.exists(): + with open(steps_path, "rb") as f: + cached_data = pickle.load(f) + return cached_data["steps"] + + except (FileNotFoundError, pickle.PickleError, KeyError) as e: + print(f"Failed to load cached steps: {e}") + print("Computing steps from scratch...") + + # Compute steps using single process + all_steps = self._get_all_steps_single_process() + + # Cache the computed steps with unique filename + try: + cache_data = { + "config_key": config_key, + "steps": all_steps, + "num_trajectories": len(self.trajectory_ids), + "total_steps": len(all_steps), + "computed_timestamp": pd.Timestamp.now().isoformat(), + "delete_pause_frame": self.delete_pause_frame, + } + + # Ensure the meta directory exists + steps_path.parent.mkdir(parents=True, exist_ok=True) + + with open(steps_path, "wb") as f: + pickle.dump(cache_data, f, protocol=pickle.HIGHEST_PROTOCOL) + print(f"Cached steps saved to {steps_path}") + except Exception as e: + print(f"Failed to cache steps: {e}") + + return all_steps + + def _get_steps_config_key(self) -> str: + """Generate a configuration key for steps caching.""" + config_dict = { + "delete_pause_frame": self.delete_pause_frame, + "dataset_name": self.dataset_name, + } + # Create a hash of the configuration + config_str = str(sorted(config_dict.items())) + return hashlib.md5(config_str.encode()).hexdigest()[:12] # + + + def _get_all_steps_single_process(self) -> list[tuple[int, int]]: + """Original single-process implementation as fallback.""" + all_steps: list[tuple[int, int]] = [] + skipped_trajectories = 0 + processed_trajectories = 0 + + # Check if language modality is configured + has_language_modality = 'language' in self.modality_keys and len(self.modality_keys['language']) > 0 + # TODO why trajectory_length here, why not use data length? 
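        # Illustrative sketch only: the loop below flattens every readable trajectory
        # into (trajectory_id, base_index) tuples, matching the `all_steps` property
        # docstring. With toy values:
        #
        #     trajectory_ids     = [0, 1, 2]
        #     trajectory_lengths = [3, 2, 4]
        #     steps = [(tid, i) for tid, n in zip(trajectory_ids, trajectory_lengths) for i in range(n)]
        #     # -> [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (2, 0), (2, 1), (2, 2), (2, 3)]
        #
        # Trajectories that fail to load, or whose language instruction is empty, are
        # skipped entirely rather than partially enumerated.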
+ for trajectory_id, trajectory_length in tqdm(zip(self.trajectory_ids, self.trajectory_lengths), total=len(self.trajectory_ids), desc="Getting All Step"): + try: + if self._lerobot_version == "v2.0": + data = self.get_trajectory_data(trajectory_id) + elif self._lerobot_version == "v3.0": + data = self.get_trajectory_data_lerobot_v3(trajectory_id) + + trajectory_skipped = False + + # Check if trajectory has valid language instruction (if language modality is configured) + if has_language_modality: + self.curr_traj_data = data # Set current trajectory data for get_language to work + + language_instruction = self.get_language(trajectory_id, self.modality_keys['language'][0], 0) + if not language_instruction or language_instruction[0] == "": + print(f"Skipping trajectory {trajectory_id} due to empty language instruction") + skipped_trajectories += 1 + trajectory_skipped = True + continue + + except Exception as e: + print(f"Skipping trajectory {trajectory_id} due to read error: {e}") + skipped_trajectories += 1 + trajectory_skipped = True + continue + + if not trajectory_skipped: + processed_trajectories += 1 + + for base_index in range(trajectory_length): + all_steps.append((trajectory_id, base_index)) + + # Print summary statistics + print(f"Single-process summary: Processed {processed_trajectories} trajectories, skipped {skipped_trajectories} empty trajectories") + print(f"Total steps: {len(all_steps)} from {len(self.trajectory_ids)} trajectories") + + return all_steps + + def _get_position_and_gripper_values(self, data: pd.DataFrame) -> tuple[list, list]: + """Get position and gripper values based on available columns in the dataset.""" + # Get action keys from modality_keys + action_keys = self.modality_keys.get('action', []) + + # Extract position data + delta_position_values = None + position_candidates = ['delta_eef_position'] + coordinate_candidates = ['x', 'y', 'z'] + + # First try combined position fields + for pos_key in position_candidates: + full_key = f"action.{pos_key}" + if full_key in action_keys: + try: + # Get the lerobot key for this modality + le_action_cfg = self.lerobot_modality_meta.action + subkey = pos_key + if subkey in le_action_cfg: + le_key = le_action_cfg[subkey].original_key or subkey + if le_key in data.columns: + data_array = np.stack(data[le_key]) + le_indices = np.arange(le_action_cfg[subkey].start, le_action_cfg[subkey].end) + filtered_data = data_array[:, le_indices] + delta_position_values = filtered_data.tolist() + break + except Exception: + continue + + # If combined fields not found, try individual x,y,z coordinates + if delta_position_values is None: + x_data, y_data, z_data = None, None, None + for coord in coordinate_candidates: + full_key = f"action.{coord}" + if full_key in action_keys: + try: + le_action_cfg = self.lerobot_modality_meta.action + if coord in le_action_cfg: + le_key = le_action_cfg[coord].original_key or coord + if le_key in data.columns: + data_array = np.stack(data[le_key]) + le_indices = np.arange(le_action_cfg[coord].start, le_action_cfg[coord].end) + coord_data = data_array[:, le_indices].flatten() + if coord == 'x': + x_data = coord_data + elif coord == 'y': + y_data = coord_data + elif coord == 'z': + z_data = coord_data + except Exception: + continue + + if x_data is not None and y_data is not None and z_data is not None: + delta_position_values = np.column_stack((x_data, y_data, z_data)).tolist() + + if delta_position_values is None: + # Fallback to the old hardcoded approach if metadata approach fails + if 
'action.delta_eef_position' in data.columns: + delta_position_values = data['action.delta_eef_position'].to_numpy().tolist() + elif all(col in data.columns for col in ['action.x', 'action.y', 'action.z']): + x_vals = data['action.x'].to_numpy() + y_vals = data['action.y'].to_numpy() + z_vals = data['action.z'].to_numpy() + delta_position_values = np.column_stack((x_vals, y_vals, z_vals)).tolist() + else: + raise ValueError(f"No suitable position columns found. Available columns: {data.columns.tolist()}") + + # Extract gripper data + gripper_values = None + gripper_candidates = ['gripper_close', 'gripper'] + + for grip_key in gripper_candidates: + full_key = f"action.{grip_key}" + if full_key in action_keys: + try: + le_action_cfg = self.lerobot_modality_meta.action + if grip_key in le_action_cfg: + le_key = le_action_cfg[grip_key].original_key or grip_key + if le_key in data.columns: + data_array = np.stack(data[le_key]) + le_indices = np.arange(le_action_cfg[grip_key].start, le_action_cfg[grip_key].end) + gripper_data = data_array[:, le_indices].flatten() + gripper_values = gripper_data.tolist() + break + except Exception: + continue + + if gripper_values is None: + # Fallback to the old hardcoded approach if metadata approach fails + if 'action.gripper_close' in data.columns: + gripper_values = data['action.gripper_close'].to_numpy().tolist() + elif 'action.gripper' in data.columns: + gripper_values = data['action.gripper'].to_numpy().tolist() + else: + raise ValueError(f"No suitable gripper columns found. Available columns: {data.columns.tolist()}") + + return delta_position_values, gripper_values + + def _get_modality_keys(self) -> dict: + """Get the modality keys for the dataset. + The keys are the modality names, and the values are the keys for each modality. + See property `modality_keys` for the expected format. 
+ """ + modality_keys = defaultdict(list) + for modality, config in self.modality_configs.items(): + modality_keys[modality] = config.modality_keys + return modality_keys + + def _get_delta_indices(self) -> dict[str, np.ndarray]: + """Restructure the delta indices to use modality.key as keys instead of just the modalities.""" + delta_indices: dict[str, np.ndarray] = {} + for config in self.modality_configs.values(): + for key in config.modality_keys: + delta_indices[key] = np.array(config.delta_indices) + return delta_indices + + def _get_lerobot_modality_meta(self) -> LeRobotModalityMetadata: + """Get the metadata for the LeRobot dataset.""" + modality_meta_path = self.dataset_path / LE_ROBOT_MODALITY_FILENAME + assert ( + modality_meta_path.exists() + ), f"Please provide a {LE_ROBOT_MODALITY_FILENAME} file in {self.dataset_path}" + with open(modality_meta_path, "r") as f: + modality_meta = LeRobotModalityMetadata.model_validate(json.load(f)) + return modality_meta + + def _get_lerobot_info_meta(self) -> dict: + """Get the metadata for the LeRobot dataset.""" + info_meta_path = self.dataset_path / LE_ROBOT_INFO_FILENAME + with open(info_meta_path, "r") as f: + info_meta = json.load(f) + return info_meta + + def _get_data_path_pattern(self) -> str: + """Get the data path pattern for the LeRobot dataset.""" + return self.lerobot_info_meta["data_path"] + + def _get_video_path_pattern(self) -> str: + """Get the video path pattern for the LeRobot dataset.""" + return self.lerobot_info_meta["video_path"] + + def _get_chunk_size(self) -> int: + """Get the chunk size for the LeRobot dataset.""" + return self.lerobot_info_meta["chunks_size"] + + def _get_tasks(self) -> pd.DataFrame: + """Get the tasks for the dataset.""" + if self._lerobot_version == "v2.0": + tasks_path = self.dataset_path / LE_ROBOT_TASKS_FILENAME + with open(tasks_path, "r") as f: + tasks = [json.loads(line) for line in f] + df = pd.DataFrame(tasks) + return df.set_index("task_index") + + elif self._lerobot_version == "v3.0": + tasks_path = self.dataset_path / LE_ROBOT3_TASKS_FILENAME + df = pd.read_parquet(tasks_path) + df = df.reset_index() # 把索引变成一列,列名通常为 'index' + df = df.rename(columns={'index': 'task'}) # 把 'index' 列重命名为 'task' + df = df[['task_index', 'task']] # 调整列顺序 + return df + def _check_integrity(self): + """Use the config to check if the keys are valid and detect silent data corruption.""" + ERROR_MSG_HEADER = f"Error occurred in initializing dataset {self.dataset_name}:\n" + + for modality_config in self.modality_configs.values(): + for key in modality_config.modality_keys: + if key == "lapa_action" or key == "dream_actions": + continue # no need for any metadata for lapa actions because it comes normalized + # Check if the key is valid + try: + self.lerobot_modality_meta.get_key_meta(key) + except Exception as e: + raise ValueError( + ERROR_MSG_HEADER + f"Unable to find key {key} in modality metadata:\n{e}" + ) + + def set_transforms_metadata(self, metadata: DatasetMetadata): + """Set the metadata for the transforms. This is useful for transforms that need to know the metadata, such as the normalization values.""" + self.transforms.set_metadata(metadata) + + def set_epoch(self, epoch: int): + """Set the epoch for the dataset. + + Args: + epoch (int): The epoch to set. + """ + self.epoch = epoch + + def __len__(self) -> int: + """Get the total number of data points in the dataset. + + Returns: + int: the total number of data points in the dataset. 
+ """ + return len(self.all_steps) + + def __str__(self) -> str: + """Get the description of the dataset.""" + return f"{self.dataset_name} ({len(self)} steps)" + + + def __getitem__(self, index: int) -> dict: + """Get the data for a single step in a trajectory. + + Args: + index (int): The index of the step to get. + + Returns: + dict: The data for the step. + """ + trajectory_id, base_index = self.all_steps[index] + data = self.get_step_data(trajectory_id, base_index) + + # Process all video keys dynamically + images = [] + mid_images = [] + for video_key in self.modality_keys.get("video", []): + video_frames = data[video_key] + image = video_frames[0] + image = Image.fromarray(image).resize((224, 224)) + images.append(image) + if self.num_history_steps != 0: + history_index = min(self.num_history_steps - 1, len(video_frames) - 1) + mid_image = video_frames[history_index] + mid_image = Image.fromarray(mid_image).resize((224, 224)) + mid_images.append(mid_image) + + # Get language and action data + language = data[self.modality_keys["language"][0]][0] + action = [] + for action_key in self.modality_keys["action"]: + action.append(data[action_key]) + action = np.concatenate(action, axis=1) + action = standardize_action_representation(action, self.tag) + + state = [] + for state_key in self.modality_keys["state"]: + state.append(data[state_key]) + state = np.concatenate(state, axis=1) + state = standardize_state_representation(state, self.tag) + + sample = dict(action=action, state=state, image=images, language=language, dataset_id=self._dataset_id) + if self.num_history_steps != 0: + sample["mid_image"] = mid_images + return sample + + def get_step_data(self, trajectory_id: int, base_index: int) -> dict: + """Get the RAW data for a single step in a trajectory. No transforms are applied. + + Args: + trajectory_id (int): The name of the trajectory. + base_index (int): The base step index in the trajectory. + + Returns: + dict: The RAW data for the step. + + Example return: + { + "video": { + "video.image_side_0": [B, T, H, W, C], + "video.image_side_1": [B, T, H, W, C], + }, + "state": { + "state.eef_position": [B, T, state_dim], + "state.eef_rotation": [B, T, state_dim], + }, + "action": { + "action.eef_position": [B, T, action_dim], + "action.eef_rotation": [B, T, action_dim], + }, + } + """ + data = {} + # Get the data for all modalities # just for action base data + self.curr_traj_data = self.get_trajectory_data(trajectory_id) + # TODO @JinhuiYE The logic below is poorly implemented. Data reading should be directly based on curr_traj_data. 
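        # Illustrative sketch only (toy values): each key is fetched by adding its
        # delta indices to `base_index`; out-of-range steps are handled by the helpers
        # further below.
        #
        #     delta_indices = np.array([0, 1, 2, 3])     # e.g. a 4-step action chunk
        #     base_index, traj_len = 58, 60
        #     step_indices = delta_indices + base_index  # [58, 59, 60, 61]
        #
        # The last two indices fall past the trajectory end: get_state_or_action pads
        # them with the final step (absolute data) or zeros (relative data), while
        # get_video clamps them to the last frame.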
+ for modality in self.modality_keys: + # Get the data corresponding to each key in the modality + for key in self.modality_keys[modality]: + data[key] = self.get_data_by_modality(trajectory_id, modality, key, base_index) + return data + + def get_trajectory_data(self, trajectory_id: int) -> pd.DataFrame: + """Get the data for a trajectory.""" + if self._lerobot_version == "v2.0": + + if self.curr_traj_id == trajectory_id and self.curr_traj_data is not None: + return self.curr_traj_data + else: + chunk_index = self.get_episode_chunk(trajectory_id) + parquet_path = self.dataset_path / self.data_path_pattern.format( + episode_chunk=chunk_index, episode_index=trajectory_id + ) + assert parquet_path.exists(), f"Parquet file not found at {parquet_path}" + return pd.read_parquet(parquet_path) + elif self._lerobot_version == "v3.0": + return self.get_trajectory_data_lerobot_v3(trajectory_id) + + def get_trajectory_data_lerobot_v3(self, trajectory_id: int) -> pd.DataFrame: + """Get the data for a trajectory from lerobot v3.""" + if self.curr_traj_id == trajectory_id and self.curr_traj_data is not None: + return self.curr_traj_data + else: #TODO check detail later + chunk_index = self.get_episode_chunk(trajectory_id) + + file_index = self.get_episode_file_index(trajectory_id) + # file_from_index = self.get_episode_file_from_index(trajectory_id) + + + parquet_path = self.dataset_path / self.data_path_pattern.format( + chunk_index=chunk_index, file_index=file_index + ) + assert parquet_path.exists(), f"Parquet file not found at {parquet_path}" + file_data = pd.read_parquet(parquet_path) + + # filter by trajectory_id + episode_data = file_data.loc[file_data["episode_index"] == trajectory_id].copy() + + # fix timestamp from epis index to file index for video alignment + if self.load_video: + from_timestamp = self.trajectory_ids_to_metadata[trajectory_id].get( + "videos/observation.images.wrist/from_timestamp", 0 + ) + episode_data["timestamp"] = episode_data["timestamp"] + from_timestamp + + return episode_data + + + def get_trajectory_index(self, trajectory_id: int) -> int: + """Get the index of the trajectory in the dataset by the trajectory ID. + This is useful when you need to get the trajectory length or sampling weight corresponding to the trajectory ID. + + Args: + trajectory_id (str): The ID of the trajectory. + + Returns: + int: The index of the trajectory in the dataset. + """ + trajectory_indices = np.where(self.trajectory_ids == trajectory_id)[0] + if len(trajectory_indices) != 1: + raise ValueError( + f"Error finding trajectory index for {trajectory_id}, found {trajectory_indices=}" + ) + return trajectory_indices[0] + + def get_episode_chunk(self, ep_index: int) -> int: + """Get the chunk index for an episode index.""" + return ep_index // self.chunk_size + def get_episode_file_index(self, ep_index: int) -> int: + """Get the file index for an episode index.""" + episode_meta = self.trajectory_ids_to_metadata[ep_index] + return episode_meta["data/file_index"] + + def get_episode_file_from_index(self, ep_index: int) -> int: + """Get the file from index for an episode index.""" + episode_meta = self.trajectory_ids_to_metadata[ep_index] + return episode_meta["data/file_from_index"] + + + def retrieve_data_and_pad( + self, + array: np.ndarray, + step_indices: np.ndarray, + max_length: int, + padding_strategy: str = "first_last", + ) -> np.ndarray: + """Retrieve the data from the dataset and pad it if necessary. + Args: + array (np.ndarray): The array to retrieve the data from. 
+ step_indices (np.ndarray): The step indices to retrieve the data for. + max_length (int): The maximum length of the data. + padding_strategy (str): The padding strategy, either "first" or "last". + """ + # Get the padding indices + front_padding_indices = step_indices < 0 + end_padding_indices = step_indices >= max_length + padding_positions = np.logical_or(front_padding_indices, end_padding_indices) + # Retrieve the data with the non-padding indices + # If there exists some padding, Given T step_indices, the shape of the retrieved data will be (T', ...) where T' < T + raw_data = array[step_indices[~padding_positions]] + assert isinstance(raw_data, np.ndarray), f"{type(raw_data)=}" + # This is the shape of the output, (T, ...) + if raw_data.ndim == 1: + expected_shape = (len(step_indices),) + else: + expected_shape = (len(step_indices), *array.shape[1:]) + + # Pad the data + output = np.zeros(expected_shape) + # Assign the non-padded data + output[~padding_positions] = raw_data + # If there exists some padding, pad the data + if padding_positions.any(): + if padding_strategy == "first_last": + # Use first / last step data to pad + front_padding_data = array[0] + end_padding_data = array[-1] + output[front_padding_indices] = front_padding_data + output[end_padding_indices] = end_padding_data + elif padding_strategy == "zero": + # Use zero padding + output[padding_positions] = 0 + else: + raise ValueError(f"Invalid padding strategy: {padding_strategy}") + return output + + def get_video_path(self, trajectory_id: int, key: str) -> Path: + chunk_index = self.get_episode_chunk(trajectory_id) + original_key = self.lerobot_modality_meta.video[key].original_key + if original_key is None: + original_key = key + if self._lerobot_version == "v2.0": + video_filename = self.video_path_pattern.format( + episode_chunk=chunk_index, episode_index=trajectory_id, video_key=original_key + ) + elif self._lerobot_version == "v3.0": + episode_meta = self.trajectory_ids_to_metadata[trajectory_id] + video_filename = self.video_path_pattern.format( + video_key=original_key, + chunk_index=episode_meta["data/chunk_index"], + file_index=episode_meta["data/file_index"], + ) + return self.dataset_path / video_filename + + def get_video( + self, + trajectory_id: int, + key: str, + base_index: int, + ) -> np.ndarray: + """Get the video frames for a trajectory by a base index. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (str): The ID of the trajectory. + key (str): The key of the video. + base_index (int): The base index of the trajectory. + + Returns: + np.ndarray: The video frames for the trajectory and frame indices. 
Shape: (T, H, W, C) + """ + # Get the step indices + step_indices = self.delta_indices[key] + base_index + # print(f"{step_indices=}") + # Get the trajectory index + trajectory_index = self.get_trajectory_index(trajectory_id) + # Ensure the indices are within the valid range + # This is equivalent to padding the video with extra frames at the beginning and end + step_indices = np.maximum(step_indices, 0) + step_indices = np.minimum(step_indices, self.trajectory_lengths[trajectory_index] - 1) + assert key.startswith("video."), f"Video key must start with 'video.', got {key}" + # Get the sub-key + key = key.replace("video.", "") + video_path = self.get_video_path(trajectory_id, key) + # Get the action/state timestamps for each frame in the video + assert self.curr_traj_data is not None, f"No data found for {trajectory_id=}" + assert "timestamp" in self.curr_traj_data.columns, f"No timestamp found in {trajectory_id=}" + timestamp: np.ndarray = self.curr_traj_data["timestamp"].to_numpy() + # Get the corresponding video timestamps from the step indices + video_timestamp = timestamp[step_indices] + + return get_frames_by_timestamps( + video_path.as_posix(), + video_timestamp, + video_backend=self.video_backend, # TODO + video_backend_kwargs=self.video_backend_kwargs, + ) + + def get_state_or_action( + self, + trajectory_id: int, + modality: str, + key: str, + base_index: int, + ) -> np.ndarray: + """Get the state or action data for a trajectory by a base index. + If the step indices are out of range, pad with the data: + if the data is stored in absolute format, pad with the first or last step data; + otherwise, pad with zero. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (int): The ID of the trajectory. + modality (str): The modality of the data. + key (str): The key of the data. + base_index (int): The base index of the trajectory. + + Returns: + np.ndarray: The data for the trajectory and step indices. + """ + # Get the step indices + step_indices = self.delta_indices[key] + base_index + # Get the trajectory index + trajectory_index = self.get_trajectory_index(trajectory_id) + # Get the maximum length of the trajectory + max_length = self.trajectory_lengths[trajectory_index] + assert key.startswith(modality + "."), f"{key} must start with {modality + '.'}, got {key}" + # Get the sub-key, e.g. 
state.joint_angles -> joint_angles + key = key.replace(modality + ".", "") + # Get the lerobot key + le_state_or_action_cfg = getattr(self.lerobot_modality_meta, modality) + le_key = le_state_or_action_cfg[key].original_key + if le_key is None: + le_key = key + # Get the data array, shape: (T, D) + assert self.curr_traj_data is not None, f"No data found for {trajectory_id=}" + assert le_key in self.curr_traj_data.columns, f"No {le_key} found in {trajectory_id=}" + data_array: np.ndarray = np.stack(self.curr_traj_data[le_key]) # type: ignore + assert data_array.ndim == 2, f"Expected 2D array, got key {le_key} is{data_array.shape} array" + le_indices = np.arange( + le_state_or_action_cfg[key].start, + le_state_or_action_cfg[key].end, + ) + data_array = data_array[:, le_indices] + # Get the state or action configuration + state_or_action_cfg = getattr(self.metadata.modalities, modality)[key] + + # Pad the data + return self.retrieve_data_and_pad( + array=data_array, + step_indices=step_indices, + max_length=max_length, + padding_strategy="first_last" if state_or_action_cfg.absolute else "zero", + # padding_strategy="zero", # HACK for realdata + ) + + def get_language( + self, + trajectory_id: int, + key: str, + base_index: int, + ) -> list[str]: + """Get the language annotation data for a trajectory by step indices. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (int): The ID of the trajectory. + key (str): The key of the annotation. + base_index (int): The base index of the trajectory. + + Returns: + list[str]: The annotation data for the trajectory and step indices. If no matching data is found, return empty strings. + """ + assert self.curr_traj_data is not None, f"No data found for {trajectory_id=}" + # Get the step indices + step_indices = self.delta_indices[key] + base_index + # Get the trajectory index + trajectory_index = self.get_trajectory_index(trajectory_id) + # Get the maximum length of the trajectory + max_length = self.trajectory_lengths[trajectory_index] + # Get the end times corresponding to the closest indices + step_indices = np.maximum(step_indices, 0) + step_indices = np.minimum(step_indices, max_length - 1) + # Get the annotations + task_indices: list[int] = [] + assert key.startswith( + "annotation." + ), f"Language key must start with 'annotation.', got {key}" + subkey = key.replace("annotation.", "") + annotation_meta = self.lerobot_modality_meta.annotation + assert annotation_meta is not None, f"Annotation metadata is None for {subkey}" + assert ( + subkey in annotation_meta + ), f"Annotation key {subkey} not found in metadata, available annotation keys: {annotation_meta.keys()}" + subkey_meta = annotation_meta[subkey] + original_key = subkey_meta.original_key + if original_key is None: + original_key = key + for i in range(len(step_indices)): # + # task_indices.append(self.curr_traj_data[original_key][step_indices[i]].item()) + value = self.curr_traj_data[original_key].iloc[step_indices[i]] # TODO check v2.0 + task_indices.append(value if isinstance(value, (int, float)) else value.item()) + + return self.tasks.loc[task_indices]["task"].tolist() + + def get_data_by_modality( + self, + trajectory_id: int, + modality: str, + key: str, + base_index: int, + ): + """Get the data corresponding to the modality for a trajectory by a base index. + This method will call the corresponding helper method based on the modality. + See the helper methods for more details. 
+ NOTE: For the language modality, the data is padded with empty strings if no matching data is found. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (int): The ID of the trajectory. + modality (str): The modality of the data. + key (str): The key of the data. + base_index (int): The base index of the trajectory. + """ + if modality == "video": + return self.get_video(trajectory_id, key, base_index) + elif modality == "state" or modality == "action": + return self.get_state_or_action(trajectory_id, modality, key, base_index) + elif modality == "language": + return self.get_language(trajectory_id, key, base_index) + else: + raise ValueError(f"Invalid modality: {modality}") + + def _save_dataset_statistics_(self, save_path: Path | str, format: str = "json") -> None: + """ + Save dataset statistics to specified path in the required format. + Only includes statistics for keys that are actually used in the dataset. + Key order follows modality config order. + + Args: + save_path (Path | str): Path to save the statistics file + format (str): Save format, currently only supports "json" + """ + save_path = Path(save_path) + save_path.parent.mkdir(parents=True, exist_ok=True) + + # Build the data structure to save + statistics_data = {} + + # Get used modality keys + used_action_keys, used_state_keys = get_used_modality_keys(self.modality_keys) + + # Organize statistics by tag + tag = self.tag + tag_stats = {} + + # Process action statistics (only for used keys, config order) + if hasattr(self.metadata.statistics, 'action') and self.metadata.statistics.action: + action_stats = self.metadata.statistics.action + filtered_action_stats = { + key: action_stats[key] + for key in used_action_keys + if key in action_stats + } + + if filtered_action_stats: + # Combine statistics from filtered action sub-keys + combined_action_stats = combine_modality_stats(filtered_action_stats) + + # Add mask field based on whether it's gripper or not + mask = generate_action_mask_for_used_keys( + self.metadata.modalities.action, filtered_action_stats.keys() + ) + combined_action_stats["mask"] = mask + + tag_stats["action"] = combined_action_stats + + # Process state statistics (only for used keys, config order) + if hasattr(self.metadata.statistics, 'state') and self.metadata.statistics.state: + state_stats = self.metadata.statistics.state + filtered_state_stats = { + key: state_stats[key] + for key in used_state_keys + if key in state_stats + } + + if filtered_state_stats: + combined_state_stats = combine_modality_stats(filtered_state_stats) + tag_stats["state"] = combined_state_stats + + # Add dataset counts + tag_stats["num_transitions"] = len(self) + tag_stats["num_trajectories"] = len(self.trajectory_ids) + + statistics_data[tag] = tag_stats + + # Save as JSON file + if format.lower() == "json": + if not str(save_path).endswith('.json'): + save_path = save_path.with_suffix('.json') + with open(save_path, 'w', encoding='utf-8') as f: + json.dump(statistics_data, f, indent=2, ensure_ascii=False) + else: + raise ValueError(f"Unsupported format: {format}. 
Currently only 'json' is supported.") + + print(f"Single dataset statistics saved to: {save_path}") + print(f"Used action keys (reordered): {list(used_action_keys)}") + print(f"Used state keys (reordered): {list(used_state_keys)}") + + + +class MixtureSpecElement(BaseModel): + dataset_path: list[Path] | Path = Field(..., description="The path to the dataset.") + dataset_weight: float = Field(..., description="The weight of the dataset in the mixture.") + distribute_weights: bool = Field( + default=False, + description="Whether to distribute the weights of the dataset across all the paths. If True, the weights will be evenly distributed across all the paths.", + ) + + +# Helper functions for dataset statistics + +def combine_modality_stats(modality_stats: dict) -> dict: + """ + Combine statistics from all sub-keys under a modality. + + Args: + modality_stats (dict): Statistics for a modality, containing multiple sub-keys. + Each sub-key contains DatasetStatisticalValues object. + + Returns: + dict: Combined statistics + """ + combined_stats = { + "mean": [], + "std": [], + "max": [], + "min": [], + "q01": [], + "q99": [] + } + + # Combine statistics in sub-key order + for subkey in modality_stats.keys(): + subkey_stats = modality_stats[subkey] # This is a DatasetStatisticalValues object + + # Convert DatasetStatisticalValues to dict-like access + for stat_name in ["mean", "std", "max", "min", "q01", "q99"]: + stat_value = getattr(subkey_stats, stat_name) + if isinstance(stat_value, (list, tuple)): + combined_stats[stat_name].extend(stat_value) + else: + # Handle NDArray case - convert to list + if hasattr(stat_value, 'tolist'): + combined_stats[stat_name].extend(stat_value.tolist()) + else: + combined_stats[stat_name].append(float(stat_value)) + + return combined_stats + +def generate_action_mask_for_used_keys(action_modalities: dict, used_action_keys_ordered) -> list[bool]: + """ + Generate mask based on action modalities, but only for used keys. + All dimensions are set to True so every channel is de/normalized. + + Args: + action_modalities (dict): Configuration information for action modalities. + used_action_keys_ordered: Iterable of actually used action keys in the correct order. + + Returns: + list[bool]: List of mask values + """ + mask = [] + + # Generate mask in the same order as the statistics were combined + for subkey in used_action_keys_ordered: + if subkey in action_modalities: + subkey_config = action_modalities[subkey] + + # Get dimension count from shape + if hasattr(subkey_config, 'shape') and len(subkey_config.shape) > 0: + dim_count = subkey_config.shape[0] + else: + dim_count = 1 + + # Check if it's gripper-related + is_gripper = "gripper" in subkey.lower() + + # Generate mask value for each dimension + for _ in range(dim_count): + mask.append(not is_gripper) # gripper is False, others are True + + return mask + +def get_used_modality_keys(modality_keys: dict) -> tuple[set, set]: + """Extract used action and state keys from modality configuration.""" + used_action_keys = [] + used_state_keys = [] + + # Extract action keys (remove "action." prefix) + for action_key in modality_keys.get("action", []): + if action_key.startswith("action."): + clean_key = action_key.replace("action.", "") + used_action_keys.append(clean_key) + + # Extract state keys (remove "state." 
prefix) + for state_key in modality_keys.get("state", []): + if state_key.startswith("state."): + clean_key = state_key.replace("state.", "") + used_state_keys.append(clean_key) + + return used_action_keys, used_state_keys + + +def safe_hash(input_tuple): + # keep 128 bits of the hash + tuple_string = repr(input_tuple).encode("utf-8") + sha256 = hashlib.sha256() + sha256.update(tuple_string) + + seed = int(sha256.hexdigest(), 16) + + return seed & 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF + + +class LeRobotMixtureDataset(Dataset): + """ + A mixture of multiple datasets. This class samples a single dataset based on the dataset weights and then calls the `__getitem__` method of the sampled dataset. + It is recommended to modify the single dataset class instead of this class. + """ + + def __init__( + self, + data_mixture: Sequence[tuple[LeRobotSingleDataset, float]], + mode: str, + balance_dataset_weights: bool = True, + balance_trajectory_weights: bool = True, + seed: int = 42, + metadata_config: dict = { + "percentile_mixing_method": "min_max", + }, + **kwargs, + ): + """ + Initialize the mixture dataset. + + Args: + data_mixture (list[tuple[LeRobotSingleDataset, float]]): Datasets and their corresponding weights. + mode (str): If "train", __getitem__ will return different samples every epoch; if "val" or "test", __getitem__ will return the same sample every epoch. + balance_dataset_weights (bool): If True, the weight of dataset will be multiplied by the total trajectory length of each dataset. + balance_trajectory_weights (bool): If True, sample trajectories within a dataset weighted by their length; otherwise, use equal weighting. + seed (int): Random seed for sampling. + """ + datasets: list[LeRobotSingleDataset] = [] + dataset_sampling_weights: list[float] = [] + for dataset, weight in data_mixture: + # Check if dataset is valid and has data + if len(dataset) == 0: + print(f"Warning: Skipping empty dataset {dataset.dataset_name}") + continue + datasets.append(dataset) + dataset_sampling_weights.append(weight) + + if len(datasets) == 0: + raise ValueError("No valid datasets found in the mixture. All datasets are empty.") + + self.datasets = datasets + self.balance_dataset_weights = balance_dataset_weights + self.balance_trajectory_weights = balance_trajectory_weights + self.seed = seed + self.mode = mode + + # Set properties for sampling + + # 1. Dataset lengths + self._dataset_lengths = np.array([len(dataset) for dataset in self.datasets]) + print(f"Dataset lengths: {self._dataset_lengths}") + + # 2. Dataset sampling weights + self._dataset_sampling_weights = np.array(dataset_sampling_weights) + + if self.balance_dataset_weights: + self._dataset_sampling_weights *= self._dataset_lengths + + # Check for zero or negative weights before normalization + if np.any(self._dataset_sampling_weights <= 0): + print(f"Warning: Found zero or negative sampling weights: {self._dataset_sampling_weights}") + # Set minimum weight to prevent division issues + self._dataset_sampling_weights = np.maximum(self._dataset_sampling_weights, 1e-8) + + # Normalize weights + weights_sum = self._dataset_sampling_weights.sum() + if weights_sum == 0 or np.isnan(weights_sum): + print(f"Error: Invalid weights sum: {weights_sum}") + # Fallback to equal weights + self._dataset_sampling_weights = np.ones(len(self.datasets)) / len(self.datasets) + print(f"Fallback to equal weights") + else: + self._dataset_sampling_weights /= weights_sum + + # 3. 
Trajectory sampling weights + self._trajectory_sampling_weights: list[np.ndarray] = [] + for i, dataset in enumerate(self.datasets): + trajectory_sampling_weights = np.ones(len(dataset.trajectory_lengths)) + if self.balance_trajectory_weights: + trajectory_sampling_weights *= dataset.trajectory_lengths + + # Check for zero or negative weights before normalization + if np.any(trajectory_sampling_weights <= 0): + print(f"Warning: Dataset {i} has zero or negative trajectory weights") + trajectory_sampling_weights = np.maximum(trajectory_sampling_weights, 1e-8) + + # Normalize weights + weights_sum = trajectory_sampling_weights.sum() + if weights_sum == 0 or np.isnan(weights_sum): + print(f"Error: Dataset {i} has invalid trajectory weights sum: {weights_sum}") + # Fallback to equal weights + trajectory_sampling_weights = np.ones(len(dataset.trajectory_lengths)) / len(dataset.trajectory_lengths) + else: + trajectory_sampling_weights /= weights_sum + + self._trajectory_sampling_weights.append(trajectory_sampling_weights) + + # 4. Primary dataset indices + self._primary_dataset_indices = np.array(dataset_sampling_weights) == 1.0 + if not np.any(self._primary_dataset_indices): + print(f"Warning: No dataset with weight 1.0 found. Original weights: {dataset_sampling_weights}") + # Fallback: use the dataset(s) with maximum weight as primary + max_weight = max(dataset_sampling_weights) + self._primary_dataset_indices = np.array(dataset_sampling_weights) == max_weight + print(f"Using datasets with maximum weight {max_weight} as primary: {self._primary_dataset_indices}") + + if not np.any(self._primary_dataset_indices): + # This should never happen, but just in case + print("Error: Still no primary dataset found. Using first dataset as primary.") + self._primary_dataset_indices = np.zeros(len(self.datasets), dtype=bool) + self._primary_dataset_indices[0] = True + + # Set the epoch and sample the first epoch + self.set_epoch(0) + + self.update_metadata(metadata_config) + + @property + def dataset_lengths(self) -> np.ndarray: + """The lengths of each dataset.""" + return self._dataset_lengths + + @property + def dataset_sampling_weights(self) -> np.ndarray: + """The sampling weights for each dataset.""" + return self._dataset_sampling_weights + + @property + def trajectory_sampling_weights(self) -> list[np.ndarray]: + """The sampling weights for each trajectory in each dataset.""" + return self._trajectory_sampling_weights + + @property + def primary_dataset_indices(self) -> np.ndarray: + """The indices of the primary datasets.""" + return self._primary_dataset_indices + + def __str__(self) -> str: + dataset_descriptions = [] + for dataset, weight in zip(self.datasets, self.dataset_sampling_weights): + dataset_description = { + "Dataset": str(dataset), + "Sampling weight": float(weight), + } + dataset_descriptions.append(dataset_description) + return json.dumps({"Mixture dataset": dataset_descriptions}, indent=2) + + def set_epoch(self, epoch: int): + """Set the epoch for the dataset. + + Args: + epoch (int): The epoch to set. 
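
        Note:
            In "train" mode the epoch is folded into the per-index sampling seed
            (roughly ``seed = safe_hash((epoch, index, self.seed))``, see
            `sample_step`), so each epoch maps the same index to a freshly sampled
            dataset/trajectory/step; in "val"/"test" mode the seed is just the
            index and the epoch has no effect.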
+ """ + self.epoch = epoch + # self.sampled_steps = self.sample_epoch() + + def sample_step(self, index: int) -> tuple[LeRobotSingleDataset, int, int]: + """Sample a single step from the dataset.""" + # return self.sampled_steps[index] + + # Set seed + seed = index if self.mode != "train" else safe_hash((self.epoch, index, self.seed)) + rng = np.random.default_rng(seed) + + # Sample dataset + dataset_index = rng.choice(len(self.datasets), p=self.dataset_sampling_weights) + dataset = self.datasets[dataset_index] + + # Sample trajectory + # trajectory_index = rng.choice( + # len(dataset.trajectory_ids), p=self.trajectory_sampling_weights[dataset_index] + # ) + # trajectory_id = dataset.trajectory_ids[trajectory_index] + + # # Sample step + # base_index = rng.choice(dataset.trajectory_lengths[trajectory_index]) + # return dataset, trajectory_id, base_index + single_step_index = rng.choice(len(dataset.all_steps)) + trajectory_id, base_index = dataset.all_steps[single_step_index] + return dataset, trajectory_id, base_index + + def __getitem__(self, index: int) -> dict: + """Get the data for a single trajectory and start index. + + Args: + index (int): The index of the trajectory to get. + + Returns: + dict: The data for the trajectory and start index. + """ + max_retries = 10 + last_exception = None + + for attempt in range(max_retries): + try: + dataset, trajectory_name, step = self.sample_step(index) + data_raw = dataset.get_step_data(trajectory_name, step) + data = dataset.transforms(data_raw) + + # Process all video keys dynamically + images = [] + mid_images = [] + num_history_steps = int(getattr(dataset, "num_history_steps", 0) or 0) + for video_key in dataset.modality_keys.get("video", []): + video_frames = data[video_key] + image = video_frames[0] + image = Image.fromarray(image).resize((224, 224)) #TODO check if this is ok + images.append(image) + if num_history_steps != 0: + history_index = min(num_history_steps - 1, len(video_frames) - 1) + mid_image = video_frames[history_index] + mid_image = Image.fromarray(mid_image).resize((224, 224)) + mid_images.append(mid_image) + + # Get language and action data + language = data[dataset.modality_keys["language"][0]][0] + action = [] + for action_key in dataset.modality_keys["action"]: + action.append(data[action_key]) + action = np.concatenate(action, axis=1).astype(np.float16) + action = standardize_action_representation(action, dataset.tag) + + state = [] + for state_key in dataset.modality_keys["state"]: + state.append(data[state_key]) + state = np.concatenate(state, axis=1).astype(np.float16) + state = standardize_state_representation(state, dataset.tag) + + sample = dict(action=action, state=state, image=images, lang=language, dataset_id=dataset._dataset_id) + if num_history_steps != 0: + sample["mid_image"] = mid_images + return sample + + except Exception as e: + last_exception = e + if attempt < max_retries - 1: + # Log the error but continue trying + print(f"Attempt {attempt + 1}/{max_retries} failed for index {index}: {e}") + print(f"Retrying with new sample...") + # For retry, we can use a slightly different index to get a new sample + # This helps avoid getting stuck on the same problematic sample + index = random.randint(0, len(self) - 1) + else: + # All retries exhausted + print(f"All {max_retries} attempts failed for index {index}") + print(f"Last error: {last_exception}") + # Return a dummy sample or re-raise the exception + raise last_exception + + def __len__(self) -> int: + """Get the length of a single epoch in the 
mixture. + + Returns: + int: The length of a single epoch in the mixture. + """ + # Check for potential issues + if len(self.datasets) == 0: + return 0 + + # Check if any dataset lengths are 0 or NaN + if np.any(self.dataset_lengths == 0) or np.any(np.isnan(self.dataset_lengths)): + print(f"Warning: Found zero or NaN dataset lengths: {self.dataset_lengths}") + # Filter out zero/NaN length datasets + valid_indices = (self.dataset_lengths > 0) & (~np.isnan(self.dataset_lengths)) + if not np.any(valid_indices): + print("Error: All datasets have zero or NaN length") + return 0 + else: + valid_indices = np.ones(len(self.datasets), dtype=bool) + + # Check if any sampling weights are 0 or NaN + if np.any(self.dataset_sampling_weights == 0) or np.any(np.isnan(self.dataset_sampling_weights)): + print(f"Warning: Found zero or NaN sampling weights: {self.dataset_sampling_weights}") + # Use only valid weights + valid_weights = (self.dataset_sampling_weights > 0) & (~np.isnan(self.dataset_sampling_weights)) + valid_indices = valid_indices & valid_weights + if not np.any(valid_indices): + print("Error: All sampling weights are zero or NaN") + return 0 + + # Check primary dataset indices + primary_and_valid = self.primary_dataset_indices & valid_indices + if not np.any(primary_and_valid): + print(f"Warning: No valid primary datasets found. Primary indices: {self.primary_dataset_indices}, Valid indices: {valid_indices}") + # Fallback: use the largest valid dataset + if np.any(valid_indices): + max_length = self.dataset_lengths[valid_indices].max() + print(f"Fallback: Using maximum dataset length: {max_length}") + return int(max_length) + else: + return 0 + + # Calculate the ratio and get max + ratios = (self.dataset_lengths / self.dataset_sampling_weights)[primary_and_valid] + + # Check for NaN or inf in ratios + if np.any(np.isnan(ratios)) or np.any(np.isinf(ratios)): + print(f"Warning: Found NaN or inf in ratios: {ratios}") + print(f"Dataset lengths: {self.dataset_lengths[primary_and_valid]}") + print(f"Sampling weights: {self.dataset_sampling_weights[primary_and_valid]}") + # Filter out invalid ratios + valid_ratios = ratios[~np.isnan(ratios) & ~np.isinf(ratios)] + if len(valid_ratios) == 0: + print("Error: All ratios are NaN or inf") + return 0 + max_ratio = valid_ratios.max() + else: + max_ratio = ratios.max() + + result = int(max_ratio) + if result == 0: + print(f"Warning: Dataset mixture length is 0") + return result + + @staticmethod + def compute_overall_statistics( + per_task_stats: list[dict[str, dict[str, list[float] | np.ndarray]]], + dataset_sampling_weights: list[float] | np.ndarray, + percentile_mixing_method: str = "weighted_average", + ) -> dict[str, dict[str, list[float]]]: + """ + Computes overall statistics from per-task statistics using dataset sample weights. + + Args: + per_task_stats: List of per-task statistics. + Example format of one element in the per-task statistics list: + { + "state.gripper": { + "min": [...], + "max": [...], + "mean": [...], + "std": [...], + "q01": [...], + "q99": [...], + }, + ... + } + dataset_sampling_weights: List of sample weights for each task. + percentile_mixing_method: The method to mix the percentiles, either "weighted_average" or "weighted_std". + + Returns: + A dict of overall statistics per modality. 
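
        Means and variances are combined with the standard mixture identity
        (weights normalized to sum to 1):

            mean = sum_i w_i * mean_i
            var  = sum_i w_i * (std_i**2 + mean_i**2) - mean**2
            std  = sqrt(var)

        Per-dimension min/max are taken as the elementwise min/max over tasks,
        and q01/q99 are mixed according to `percentile_mixing_method`.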
+ """ + # Normalize the sample weights to sum to 1 + dataset_sampling_weights = np.array(dataset_sampling_weights) + normalized_weights = dataset_sampling_weights / dataset_sampling_weights.sum() + + # Initialize overall statistics dict + overall_stats: dict[str, dict[str, list[float]]] = {} + + # Get the list of modality keys + modality_keys = per_task_stats[0].keys() + + for modality in modality_keys: + # Number of dimensions (assuming consistent across tasks) + num_dims = len(per_task_stats[0][modality]["mean"]) + + # Initialize accumulators for means and variances + weighted_means = np.zeros(num_dims) + weighted_squares = np.zeros(num_dims) + + # Collect min, max, q01, q99 from all tasks + min_list = [] + max_list = [] + q01_list = [] + q99_list = [] + + for task_idx, task_stats in enumerate(per_task_stats): + w_i = normalized_weights[task_idx] + stats = task_stats[modality] + means = np.array(stats["mean"]) + stds = np.array(stats["std"]) + + # Update weighted sums for mean and variance + weighted_means += w_i * means + weighted_squares += w_i * (stds**2 + means**2) + + # Collect min, max, q01, q99 + min_list.append(stats["min"]) + max_list.append(stats["max"]) + q01_list.append(stats["q01"]) + q99_list.append(stats["q99"]) + + # Compute overall mean + overall_mean = weighted_means.tolist() + + # Compute overall variance and std deviation + overall_variance = weighted_squares - weighted_means**2 + overall_std = np.sqrt(overall_variance).tolist() + + # Compute overall min and max per dimension + overall_min = np.min(np.array(min_list), axis=0).tolist() + overall_max = np.max(np.array(max_list), axis=0).tolist() + + # Compute overall q01 and q99 per dimension + # Use weighted average of per-task quantiles + q01_array = np.array(q01_list) + q99_array = np.array(q99_list) + if percentile_mixing_method == "weighted_average": + weighted_q01 = np.average(q01_array, axis=0, weights=normalized_weights).tolist() + weighted_q99 = np.average(q99_array, axis=0, weights=normalized_weights).tolist() + # std_q01 = np.std(q01_array, axis=0).tolist() + # std_q99 = np.std(q99_array, axis=0).tolist() + # print(modality) + # print(f"{std_q01=}, {std_q99=}") + # print(f"{weighted_q01=}, {weighted_q99=}") + elif percentile_mixing_method == "min_max": + weighted_q01 = np.min(q01_array, axis=0).tolist() + weighted_q99 = np.max(q99_array, axis=0).tolist() + else: + raise ValueError(f"Invalid percentile mixing method: {percentile_mixing_method}") + + # Store the overall statistics for the modality + overall_stats[modality] = { + "min": overall_min, + "max": overall_max, + "mean": overall_mean, + "std": overall_std, + "q01": weighted_q01, + "q99": weighted_q99, + } + + return overall_stats + + @staticmethod + def merge_metadata( + metadatas: list[DatasetMetadata], + dataset_sampling_weights: list[float], + percentile_mixing_method: str, + ) -> DatasetMetadata: + """Merge multiple metadata into one.""" + # Convert to dicts + metadata_dicts = [metadata.model_dump(mode="json") for metadata in metadatas] + # Create a new metadata dict + merged_metadata = {} + + # Check all metadata have the same embodiment tag + assert all( + metadata.embodiment_tag == metadatas[0].embodiment_tag for metadata in metadatas + ), "All metadata must have the same embodiment tag" + merged_metadata["embodiment_tag"] = metadatas[0].embodiment_tag + + # Merge the dataset statistics + dataset_statistics = {} + dataset_statistics["state"] = LeRobotMixtureDataset.compute_overall_statistics( + per_task_stats=[m["statistics"]["state"] for m in 
metadata_dicts], + dataset_sampling_weights=dataset_sampling_weights, + percentile_mixing_method=percentile_mixing_method, + ) + dataset_statistics["action"] = LeRobotMixtureDataset.compute_overall_statistics( + per_task_stats=[m["statistics"]["action"] for m in metadata_dicts], + dataset_sampling_weights=dataset_sampling_weights, + percentile_mixing_method=percentile_mixing_method, + ) + merged_metadata["statistics"] = dataset_statistics + + # Merge the modality configs + modality_configs = defaultdict(set) + for metadata in metadata_dicts: + for modality, configs in metadata["modalities"].items(): + modality_configs[modality].add(json.dumps(configs)) + merged_metadata["modalities"] = {} + for modality, configs in modality_configs.items(): + # Check that all modality configs correspond to the same tag matches + assert ( + len(configs) == 1 + ), f"Multiple modality configs for modality {modality}: {list(configs)}" + merged_metadata["modalities"][modality] = json.loads(configs.pop()) + + return DatasetMetadata.model_validate(merged_metadata) + + def update_metadata(self, metadata_config: dict, cached_statistics_path: Path | str | None = None) -> None: + """ + Merge multiple metadatas into one and set the transforms with the merged metadata. + + Args: + metadata_config (dict): Configuration for the metadata. + "percentile_mixing_method": The method to mix the percentiles, either "weighted_average" or "min_max". + weighted_average: Use the weighted average of the percentiles using the weight used in sampling the datasets. + min_max: Use the min of the 1st percentile and max of the 99th percentile. + """ + # If cached path is provided, try to load and apply + if cached_statistics_path is not None: + try: + cached_stats = self.load_merged_statistics(cached_statistics_path) + self.apply_cached_statistics(cached_stats) + return + except (FileNotFoundError, KeyError, ValidationError) as e: + print(f"Failed to load cached statistics: {e}") + print("Falling back to computing statistics from scratch...") + + self.tag = EmbodimentTag.NEW_EMBODIMENT.value + self.merged_metadata: dict[str, DatasetMetadata] = {} + # Group metadata by tag + all_metadatas: dict[str, list[DatasetMetadata]] = {} + for dataset in self.datasets: + if dataset.tag not in all_metadatas: + all_metadatas[dataset.tag] = [] + all_metadatas[dataset.tag].append(dataset.metadata) + for tag, metadatas in all_metadatas.items(): + self.merged_metadata[tag] = self.merge_metadata( + metadatas=metadatas, + dataset_sampling_weights=self.dataset_sampling_weights.tolist(), + percentile_mixing_method=metadata_config["percentile_mixing_method"], + ) + for dataset in self.datasets: + dataset.set_transforms_metadata(self.merged_metadata[dataset.tag]) + + def save_dataset_statistics(self, save_path: Path | str, format: str = "json") -> None: + """ + Save merged dataset statistics to specified path in the required format. + Only includes statistics for keys that are actually used in the datasets. + Key order follows each tag's modality config order. 
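        An illustrative sketch of the saved layout (the tag, key order, and counts below
        are made up; the real keys follow each tag's modality config):

            {
              "new_embodiment": {
                "action": {"mean": [...], "std": [...], "max": [...], "min": [...],
                           "q01": [...], "q99": [...], "mask": [true, true, false]},
                "state":  {"mean": [...], "std": [...], "max": [...], "min": [...],
                           "q01": [...], "q99": [...]},
                "num_transitions": 12345,
                "num_trajectories": 100
              }
            }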
+ + Args: + save_path (Path | str): Path to save the statistics file + format (str): Save format, currently only supports "json" + """ + save_path = Path(save_path) + save_path.parent.mkdir(parents=True, exist_ok=True) + + # Build the data structure to save + statistics_data = {} + + # Keep key orders per embodiment tag (from modality config order) + tag_to_used_action_keys = {} + tag_to_used_state_keys = {} + for dataset in self.datasets: + if dataset.tag in tag_to_used_action_keys: + continue + used_action_keys, used_state_keys = get_used_modality_keys(dataset.modality_keys) + tag_to_used_action_keys[dataset.tag] = used_action_keys + tag_to_used_state_keys[dataset.tag] = used_state_keys + + # Organize statistics by tag + for tag, merged_metadata in self.merged_metadata.items(): + tag_stats = {} + + # Process action statistics + if hasattr(merged_metadata.statistics, 'action') and merged_metadata.statistics.action: + action_stats = merged_metadata.statistics.action + + used_action_keys = tag_to_used_action_keys.get(tag, []) + filtered_action_stats = { + key: action_stats[key] + for key in used_action_keys + if key in action_stats + } + + if filtered_action_stats: + combined_action_stats = combine_modality_stats(filtered_action_stats) + + mask = generate_action_mask_for_used_keys( + merged_metadata.modalities.action, filtered_action_stats.keys() + ) + combined_action_stats["mask"] = mask + + tag_stats["action"] = combined_action_stats + + # Process state statistics + if hasattr(merged_metadata.statistics, 'state') and merged_metadata.statistics.state: + state_stats = merged_metadata.statistics.state + + used_state_keys = tag_to_used_state_keys.get(tag, []) + filtered_state_stats = { + key: state_stats[key] + for key in used_state_keys + if key in state_stats + } + + if filtered_state_stats: + combined_state_stats = combine_modality_stats(filtered_state_stats) + tag_stats["state"] = combined_state_stats + + # Add dataset counts + tag_stats.update(self._get_dataset_counts(tag)) + + statistics_data[tag] = tag_stats + + # Save file + if format.lower() == "json": + if not str(save_path).endswith('.json'): + save_path = save_path.with_suffix('.json') + with open(save_path, 'w', encoding='utf-8') as f: + json.dump(statistics_data, f, indent=2, ensure_ascii=False) + else: + raise ValueError(f"Unsupported format: {format}. Currently only 'json' is supported.") + + print(f"Merged dataset statistics saved to: {save_path}") + print(f"Used action keys by tag: {tag_to_used_action_keys}") + print(f"Used state keys by tag: {tag_to_used_state_keys}") + + + def _combine_modality_stats(self, modality_stats: dict) -> dict: + """Backward compatibility wrapper.""" + return combine_modality_stats(modality_stats) + + def _generate_action_mask_for_used_keys(self, action_modalities: dict, used_action_keys_ordered) -> list[bool]: + """Backward compatibility wrapper.""" + return generate_action_mask_for_used_keys(action_modalities, used_action_keys_ordered) + + def _get_dataset_counts(self, tag: str) -> dict: + """ + Get dataset count information for specified tag. 
+ + Args: + tag (str): embodiment tag + + Returns: + dict: Dictionary containing num_transitions and num_trajectories + """ + num_transitions = 0 + num_trajectories = 0 + + # Count dataset information belonging to this tag + for dataset in self.datasets: + if dataset.tag == tag: + num_transitions += len(dataset) + num_trajectories += len(dataset.trajectory_ids) + + return { + "num_transitions": num_transitions, + "num_trajectories": num_trajectories + } + + @classmethod + def load_merged_statistics(cls, load_path: Path | str) -> dict: + """ + Load merged dataset statistics from file. + + Args: + load_path (Path | str): Path to the statistics file + + Returns: + dict: Dictionary containing merged statistics + """ + load_path = Path(load_path) + if not load_path.exists(): + raise FileNotFoundError(f"Statistics file not found: {load_path}") + + if load_path.suffix.lower() == '.json': + with open(load_path, 'r', encoding='utf-8') as f: + return json.load(f) + elif load_path.suffix.lower() == '.pkl': + import pickle + with open(load_path, 'rb') as f: + return pickle.load(f) + else: + raise ValueError(f"Unsupported file format: {load_path.suffix}") + + def apply_cached_statistics(self, cached_statistics: dict) -> None: + """ + Apply cached statistics to avoid recomputation. + + Args: + cached_statistics (dict): Statistics loaded from file + """ + # Validate that cached statistics match current datasets + if "metadata" in cached_statistics: + cached_dataset_names = set(cached_statistics["metadata"]["dataset_names"]) + current_dataset_names = set(dataset.dataset_name for dataset in self.datasets) + + if cached_dataset_names != current_dataset_names: + print("Warning: Cached statistics dataset names don't match current datasets.") + print(f"Cached: {cached_dataset_names}") + print(f"Current: {current_dataset_names}") + return + + # Apply cached statistics + self.merged_metadata = {} + for tag, stats_data in cached_statistics.items(): + if tag == "metadata": # Skip metadata field + continue + + # Convert back to DatasetMetadata format + metadata_dict = { + "embodiment_tag": tag, + "statistics": { + "action": {}, + "state": {} + }, + "modalities": {} + } + + # Convert action statistics back + if "action" in stats_data: + action_data = stats_data["action"] + # This is simplified - you may need to split back to sub-keys + metadata_dict["statistics"]["action"] = action_data + + # Convert state statistics back + if "state" in stats_data: + state_data = stats_data["state"] + metadata_dict["statistics"]["state"] = state_data + + self.merged_metadata[tag] = DatasetMetadata.model_validate(metadata_dict) + + # Update transforms metadata for each dataset + for dataset in self.datasets: + if dataset.tag in self.merged_metadata: + dataset.set_transforms_metadata(self.merged_metadata[dataset.tag]) + + print(f"Applied cached statistics for {len(self.merged_metadata)} embodiment tags.") + diff --git a/code/dataloader/gr00t_lerobot/datasets_bak.py b/code/dataloader/gr00t_lerobot/datasets_bak.py new file mode 100644 index 0000000000000000000000000000000000000000..a6a603bd28570a0e0adae01329486ddd63aa3996 --- /dev/null +++ b/code/dataloader/gr00t_lerobot/datasets_bak.py @@ -0,0 +1,2175 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +In this file, we define 3 types of datasets: +1. LeRobotSingleDataset: a single dataset for a given embodiment tag +2. LeRobotMixtureDataset: a mixture of datasets for a given list of embodiment tags +3. CachedLeRobotSingleDataset: a single dataset for a given embodiment tag, + with caching for the video frames + +See `scripts/load_dataset.py` for examples on how to use these datasets. +""" +import os +import hashlib +import json, torch +from collections import defaultdict +from pathlib import Path +from typing import Sequence +import os, random +import numpy as np +import pandas as pd +from pydantic import BaseModel, Field, ValidationError +from torch.utils.data import Dataset +from tqdm import tqdm +from PIL import Image + +from starVLA.dataloader.gr00t_lerobot.video import get_all_frames, get_frames_by_timestamps + +from starVLA.dataloader.gr00t_lerobot.embodiment_tags import EmbodimentTag, DATASET_NAME_TO_ID +from starVLA.dataloader.gr00t_lerobot.schema import ( + DatasetMetadata, + DatasetStatisticalValues, + LeRobotModalityMetadata, + LeRobotStateActionMetadata, +) +from starVLA.dataloader.gr00t_lerobot.transform import ComposedModalityTransform + +from functools import partial +from typing import Tuple, List +import pickle + +# LeRobot v2.0 dataset file names +LE_ROBOT_MODALITY_FILENAME = "meta/modality.json" +LE_ROBOT_EPISODE_FILENAME = "meta/episodes.jsonl" +LE_ROBOT_TASKS_FILENAME = "meta/tasks.jsonl" +LE_ROBOT_INFO_FILENAME = "meta/info.json" +LE_ROBOT_STATS_FILENAME = "meta/stats_gr00t.json" +LE_ROBOT_DATA_FILENAME = "data/*/*.parquet" +LE_ROBOT_STEPS_FILENAME = "meta/steps.pkl" +EPSILON = 5e-4 + +# LeRobot v3.0 dataset file names +LE_ROBOT3_TASKS_FILENAME = "meta/tasks.parquet" +LE_ROBOT3_EPISODE_FILENAME = "meta/episodes/*/*.parquet" + + +# ============================================================================= +# Unified Representation Layout & Helpers +# ============================================================================= + +STANDARD_ACTION_DIM = 37 +# +# Unified action representation layout (0-based indices, Python slice is [start, stop)): +# TIGHT layout: all datasets share the same 29D space for better cross-embodiment transfer. 
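# As a quick illustrative sketch (not part of the original layout notes):
# standardize_action_representation(np.ones(7, dtype=np.float32), "libero_franka")
# returns a 37D vector that is 1.0 on dims 7..13 (the single-arm right_arm slot) and
# 0.0 everywhere else. The slot assignments are: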
+# +# - 0:7 -> left_arm (7D): xyz, rpy/euler, gripper +# Used by: robotwin left arm; gr1 left_arm +# - 7:14 -> right_arm (7D): same structure +# Used by: libero, bridge, fractal(rt1), oxe_droid (single-arm -> right slot); +# robotwin right arm; gr1 right_arm +# - 14:20 -> left_hand (6D): gr1 only +# - 20:26 -> right_hand (6D): gr1 only +# - 26:29 -> waist (3D): gr1 only +# - 29:37 -> joints + gripper (8D): real_world_franka only +# +# Mapping: +# libero/bridge/fractal/oxe_droid (7D) -> [7:14] (right_arm slot, single-arm default) +# robotwin (14D, left+right) -> [0:14] +# gr1/robocasa (29D) -> [0:29] +# real-world (8D) -> [29:37] (joints + gripper) + +ACTION_REPRESENTATION_SLICES = { + # Single-arm (7D) -> right_arm slot [7:14] (single-arm default to right hand) + "franka": slice(7, 14), + "libero_franka": slice(7, 14), + "oxe_droid": slice(7, 14), + "oxe_rt1": slice(7, 14), + "oxe_bridge": slice(7, 14), + + # Dual-arm (14D) -> left [0:7] + right [7:14] + "dual_arm_franka": slice(0, 14), + "robotwin": slice(0, 14), + + # Humanoid (29D) -> full [0:29], standard vector 30D (index 29 pad 0) + "gr1": slice(0, 29), + "fourier_gr1_arms_waist": slice(0, 29), + + # Real-world (8D) -> [29:37] (joints + gripper) + "real_world_franka": slice(29, 37), + + # Fallback (single-arm -> right slot) + "new_embodiment": slice(7, 14), +} + +STANDARD_STATE_DIM = 88 +# Mapping: +# robotwin (14D) -> [0:14] (left [0:7] + right [7:14]) +# libero/bridge/fractal (8D) -> [14:22] (right slot) +# real-world (8D) -> [22:30] (joints + gripper) +# gr1 (58D after sin/cos) -> [30:88] (isolated, different transform) + +STATE_REPRESENTATION_SLICES = { + # Dual-arm (14D) -> left [0:7] + right [7:14] + "dual_arm_franka": slice(0, 14), + "robotwin": slice(0, 14), + # Single-arm (8D) -> right slot [7:15] (aligned with action right [7:14]) + "franka": slice(14, 22), + "libero_franka": slice(14, 22), + "oxe_droid": slice(14, 22), + "oxe_rt1": slice(14, 22), + "oxe_bridge": slice(14, 22), + # Real-world (8D) -> [22:30] (joints + gripper) + "real_world_franka": slice(22, 30), + # GR1 isolated [30:88] (58D, has StateActionSinCosTransform - different pipeline) + "gr1": slice(30, 88), + # Fallback (single-arm -> right slot) + "new_embodiment": slice(14, 22), +} + + +def standardize_action_representation( + action: np.ndarray, embodiment_tag: str +) -> np.ndarray: + """Map per-robot action to a fixed-size standard action vector.""" + target_slice = ACTION_REPRESENTATION_SLICES.get(embodiment_tag) + + # Fallback to 'new_embodiment' if tag not found, or raise error + if target_slice is None: + if "new_embodiment" in ACTION_REPRESENTATION_SLICES: + target_slice = ACTION_REPRESENTATION_SLICES["new_embodiment"] + else: + raise ValueError( + f"Unknown embodiment tag '{embodiment_tag}' for action mapping. " + f"Known tags: {sorted(ACTION_REPRESENTATION_SLICES)}" + ) + + expected_dim = target_slice.stop - target_slice.start + if action.shape[-1] != expected_dim: + raise ValueError( + f"Action dim mismatch for tag '{embodiment_tag}': " + f"{action.shape[-1]=} vs expected {expected_dim}." 
+ ) + + standard = np.zeros( + (*action.shape[:-1], STANDARD_ACTION_DIM), dtype=action.dtype + ) + standard[..., target_slice] = action + return standard + + +def standardize_state_representation( + state: np.ndarray, embodiment_tag: str +) -> np.ndarray: + """Map per-robot state to a fixed-size standard state vector.""" + + target_slice = STATE_REPRESENTATION_SLICES.get(embodiment_tag) + + # Fallback to 'new_embodiment' if tag not found, or raise error + if target_slice is None: + if "new_embodiment" in STATE_REPRESENTATION_SLICES: + target_slice = STATE_REPRESENTATION_SLICES["new_embodiment"] + else: + raise ValueError( + f"Unknown embodiment tag '{embodiment_tag}' for state mapping. " + f"Known tags: {sorted(STATE_REPRESENTATION_SLICES)}" + ) + + expected_dim = target_slice.stop - target_slice.start + if state.shape[-1] != expected_dim: + raise ValueError( + f"State dim mismatch for tag '{embodiment_tag}': " + f"{state.shape[-1]=} vs expected {expected_dim}." + ) + + standard = np.zeros( + (*state.shape[:-1], STANDARD_STATE_DIM), dtype=state.dtype + ) + standard[..., target_slice] = state + return standard + + +def calculate_dataset_statistics(parquet_paths: list[Path]) -> dict: + """Calculate the dataset statistics of all columns for a list of parquet files.""" + # Dataset statistics + all_low_dim_data_list = [] + # Collect all the data + # parquet_paths = parquet_paths[:3] + for parquet_path in tqdm( + sorted(list(parquet_paths)), + desc="Collecting all parquet files...", + ): + # Load the parquet file + parquet_data = pd.read_parquet(parquet_path) + parquet_data = parquet_data + all_low_dim_data_list.append(parquet_data) + + all_low_dim_data = pd.concat(all_low_dim_data_list, axis=0) + # Compute dataset statistics + dataset_statistics = {} + for le_modality in all_low_dim_data.columns: + if le_modality.startswith("annotation."): + continue + print(f"Computing statistics for {le_modality}...") + np_data = np.vstack( + [np.asarray(x, dtype=np.float32) for x in all_low_dim_data[le_modality]] + ) + dataset_statistics[le_modality] = { + "mean": np.mean(np_data, axis=0).tolist(), + "std": np.std(np_data, axis=0).tolist(), + "min": np.min(np_data, axis=0).tolist(), + "max": np.max(np_data, axis=0).tolist(), + "q01": np.quantile(np_data, 0.01, axis=0).tolist(), + "q99": np.quantile(np_data, 0.99, axis=0).tolist(), + } + return dataset_statistics + + +class ModalityConfig(BaseModel): + """Configuration for a modality.""" + + delta_indices: list[int] + """Delta indices to sample relative to the current index. The returned data will correspond to the original data at a sampled base index + delta indices.""" + modality_keys: list[str] + """The keys to load for the modality in the dataset.""" + + +class LeRobotSingleDataset(Dataset): + """ + Base dataset class for LeRobot that supports sharding. + """ + def __init__( + self, + dataset_path: Path | str, + modality_configs: dict[str, ModalityConfig], + embodiment_tag: str | EmbodimentTag, + video_backend: str = "decord", + video_backend_kwargs: dict | None = None, + transforms: ComposedModalityTransform | None = None, + delete_pause_frame: bool = False, + **kwargs, + ): + """ + Initialize the dataset. + + Args: + dataset_path (Path | str): The path to the dataset. + modality_configs (dict[str, ModalityConfig]): The configuration for each modality. The keys are the modality names, and the values are the modality configurations. + See `ModalityConfig` for more details. + video_backend (str): Backend for video reading. 
+ video_backend_kwargs (dict): Keyword arguments for the video backend when initializing the video reader. + transforms (ComposedModalityTransform): The transforms to apply to the dataset. + embodiment_tag (EmbodimentTag): Overload the embodiment tag for the dataset. e.g. define it as "new_embodiment" + """ + # first check if the path directory exists + if not Path(dataset_path).exists(): + raise FileNotFoundError(f"Dataset path {dataset_path} does not exist") + data_cfg = kwargs.get("data_cfg", {}) or {} + # indict letobot version + self._lerobot_version = data_cfg.get("lerobot_version", "v2.0") #self._indict_lerobot_version(**kwargs) + self.load_video = data_cfg.get("load_video", True) + + self.delete_pause_frame = delete_pause_frame + + # If video loading is disabled, skip video modality end-to-end. + if self.load_video: + self.modality_configs = modality_configs + else: + self.modality_configs = { + modality: config + for modality, config in modality_configs.items() + if modality != "video" + } + self.video_backend = video_backend + self.video_backend_kwargs = video_backend_kwargs if video_backend_kwargs is not None else {} + self.transforms = ( + transforms if transforms is not None else ComposedModalityTransform(transforms=[]) + ) + + self._dataset_path = Path(dataset_path) + self._dataset_name = self._dataset_path.name + self._dataset_id = DATASET_NAME_TO_ID.get(self._dataset_name) + if isinstance(embodiment_tag, EmbodimentTag): + self.tag = embodiment_tag.value + else: + self.tag = embodiment_tag + + self._metadata = self._get_metadata(EmbodimentTag(self.tag)) + + # LeRobot-specific config + self._lerobot_modality_meta = self._get_lerobot_modality_meta() + self._lerobot_info_meta = self._get_lerobot_info_meta() + self._data_path_pattern = self._get_data_path_pattern() + self._video_path_pattern = self._get_video_path_pattern() + self._chunk_size = self._get_chunk_size() + self._tasks = self._get_tasks() + self.curr_traj_data = None + self.curr_traj_id = None + + self._trajectory_ids, self._trajectory_lengths = self._get_trajectories() + self._modality_keys = self._get_modality_keys() + self._delta_indices = self._get_delta_indices() + self._all_steps = self._get_all_steps() + self.set_transforms_metadata(self.metadata) + self.set_epoch(0) + + print(f"Initialized dataset {self.dataset_name} with {embodiment_tag}") + + + # Check if the dataset is valid + self._check_integrity() + + @property + def dataset_path(self) -> Path: + """The path to the dataset that contains the METADATA_FILENAME file.""" + return self._dataset_path + + @property + def metadata(self) -> DatasetMetadata: + """The metadata for the dataset, loaded from metadata.json in the dataset directory""" + return self._metadata + + @property + def trajectory_ids(self) -> np.ndarray: + """The trajectory IDs in the dataset, stored as a 1D numpy array of strings.""" + return self._trajectory_ids + + @property + def trajectory_lengths(self) -> np.ndarray: + """The trajectory lengths in the dataset, stored as a 1D numpy array of integers. + The order of the lengths is the same as the order of the trajectory IDs. + """ + return self._trajectory_lengths + + @property + def all_steps(self) -> list[tuple[int, int]]: + """The trajectory IDs and base indices for all steps in the dataset. 
+ Example: + self.trajectory_ids: [0, 1, 2] + self.trajectory_lengths: [3, 2, 4] + return: [ + ("traj_0", 0), ("traj_0", 1), ("traj_0", 2), + ("traj_1", 0), ("traj_1", 1), + ("traj_2", 0), ("traj_2", 1), ("traj_2", 2), ("traj_2", 3) + ] + """ + return self._all_steps + + @property + def modality_keys(self) -> dict: + """The modality keys for the dataset. The keys are the modality names, and the values are the keys for each modality. + + Example: { + "video": ["video.image_side_0", "video.image_side_1"], + "state": ["state.eef_position", "state.eef_rotation"], + "action": ["action.eef_position", "action.eef_rotation"], + "language": ["language.human.task"], + "timestamp": ["timestamp"], + "reward": ["reward"], + } + """ + return self._modality_keys + + @property + def delta_indices(self) -> dict[str, np.ndarray]: + """The delta indices for the dataset. The keys are the modality.key, and the values are the delta indices for each modality.key.""" + return self._delta_indices + + @property + def dataset_name(self) -> str: + """The name of the dataset.""" + return self._dataset_name + + @property + def lerobot_modality_meta(self) -> LeRobotModalityMetadata: + """The metadata for the LeRobot dataset.""" + return self._lerobot_modality_meta + + @property + def lerobot_info_meta(self) -> dict: + """The metadata for the LeRobot dataset.""" + return self._lerobot_info_meta + + @property + def data_path_pattern(self) -> str: + """The path pattern for the LeRobot dataset.""" + return self._data_path_pattern + + @property + def video_path_pattern(self) -> str: + """The path pattern for the LeRobot dataset.""" + return self._video_path_pattern + + @property + def chunk_size(self) -> int: + """The chunk size for the LeRobot dataset.""" + return self._chunk_size + + @property + def tasks(self) -> pd.DataFrame: + """The tasks for the dataset.""" + return self._tasks + + def _get_metadata(self, embodiment_tag: EmbodimentTag) -> DatasetMetadata: + """Get the metadata for the dataset. + + Returns: + dict: The metadata for the dataset. + """ + + # 1. Modality metadata + modality_meta_path = self.dataset_path / LE_ROBOT_MODALITY_FILENAME + assert ( + modality_meta_path.exists() + ), f"Please provide a {LE_ROBOT_MODALITY_FILENAME} file in {self.dataset_path}" + # 1.1. State and action modalities + simplified_modality_meta: dict[str, dict] = {} + with open(modality_meta_path, "r") as f: + le_modality_meta = LeRobotModalityMetadata.model_validate(json.load(f)) + for modality in ["state", "action"]: + simplified_modality_meta[modality] = {} + le_state_action_meta: dict[str, LeRobotStateActionMetadata] = getattr( + le_modality_meta, modality + ) + for subkey in le_state_action_meta: + state_action_dtype = np.dtype(le_state_action_meta[subkey].dtype) + if np.issubdtype(state_action_dtype, np.floating): + continuous = True + else: + continuous = False + simplified_modality_meta[modality][subkey] = { + "absolute": le_state_action_meta[subkey].absolute, + "rotation_type": le_state_action_meta[subkey].rotation_type, + "shape": [ + le_state_action_meta[subkey].end - le_state_action_meta[subkey].start + ], + "continuous": continuous, + } + + # 1.2. 
Video modalities + le_info_path = self.dataset_path / LE_ROBOT_INFO_FILENAME + assert ( + le_info_path.exists() + ), f"Please provide a {LE_ROBOT_INFO_FILENAME} file in {self.dataset_path}" + with open(le_info_path, "r") as f: + le_info = json.load(f) + simplified_modality_meta["video"] = {} + for new_key in le_modality_meta.video: + original_key = le_modality_meta.video[new_key].original_key + if original_key is None: + original_key = new_key + le_video_meta = le_info["features"][original_key] + height = le_video_meta["shape"][le_video_meta["names"].index("height")] + width = le_video_meta["shape"][le_video_meta["names"].index("width")] + # NOTE(FH): different lerobot dataset versions have different keys for the number of channels and fps + try: + channels = le_video_meta["shape"][le_video_meta["names"].index("channel")] + fps = le_video_meta["video_info"]["video.fps"] + except (ValueError, KeyError): + # channels = le_video_meta["shape"][le_video_meta["names"].index("channels")] + channels = le_video_meta["info"]["video.channels"] + fps = le_video_meta["info"]["video.fps"] + simplified_modality_meta["video"][new_key] = { + "resolution": [width, height], + "channels": channels, + "fps": fps, + } + + # 2. Dataset statistics + stats_path = self.dataset_path / LE_ROBOT_STATS_FILENAME + try: + with open(stats_path, "r") as f: + le_statistics = json.load(f) + for stat in le_statistics.values(): + DatasetStatisticalValues.model_validate(stat) + except (FileNotFoundError, ValidationError) as e: + print(f"Failed to load dataset statistics: {e}") + print(f"Calculating dataset statistics for {self.dataset_name}") + # Get all parquet files in the dataset paths + parquet_files = list((self.dataset_path).glob(LE_ROBOT_DATA_FILENAME)) + parquet_files_filtered = [] + # parquet_files[0].name = "episode_033675.parquet" is broken file + for pf in parquet_files: + if "episode_033675.parquet" in pf.name: + continue + parquet_files_filtered.append(pf) + + le_statistics = calculate_dataset_statistics(parquet_files_filtered) + with open(stats_path, "w") as f: + json.dump(le_statistics, f, indent=4) + dataset_statistics = {} + for our_modality in ["state", "action"]: + dataset_statistics[our_modality] = {} + for subkey in simplified_modality_meta[our_modality]: + dataset_statistics[our_modality][subkey] = {} + state_action_meta = le_modality_meta.get_key_meta(f"{our_modality}.{subkey}") + assert isinstance(state_action_meta, LeRobotStateActionMetadata) + le_modality = state_action_meta.original_key + for stat_name in le_statistics[le_modality]: + indices = np.arange( + state_action_meta.start, + state_action_meta.end, + ) + stat = np.array(le_statistics[le_modality][stat_name]) + dataset_statistics[our_modality][subkey][stat_name] = stat[indices].tolist() + + # 3. 
Full dataset metadata + metadata = DatasetMetadata( + statistics=dataset_statistics, # type: ignore + modalities=simplified_modality_meta, # type: ignore + embodiment_tag=embodiment_tag, + ) + + return metadata + + def _get_trajectories(self) -> tuple[np.ndarray, np.ndarray]: + """Get the trajectories in the dataset.""" + # Get trajectory lengths, IDs, and whitelist from dataset metadata + # v2.0 + if self._lerobot_version == "v2.0": + file_path = self.dataset_path / LE_ROBOT_EPISODE_FILENAME + with open(file_path, "r") as f: + episode_metadata = [json.loads(line) for line in f] + trajectory_ids = [] + trajectory_lengths = [] + for episode in episode_metadata: + trajectory_ids.append(episode["episode_index"]) + trajectory_lengths.append(episode["length"]) + return np.array(trajectory_ids), np.array(trajectory_lengths) + # v3.0 + elif self._lerobot_version == "v3.0": + file_paths = list((self.dataset_path).glob(LE_ROBOT3_EPISODE_FILENAME)) + trajectory_ids = [] + trajectory_lengths = [] + # data_chunk_index = [] + # data_file_index = [] + # video_from_index = [] + self.trajectory_ids_to_metadata = {} + for file_path in file_paths: + episodes_data = pd.read_parquet(file_path) + for index, episode in episodes_data.iterrows(): + trajectory_ids.append(episode["episode_index"]) + trajectory_lengths.append(episode["length"]) + + # TODO auto-map keys? For now, just map to file_path and file_from_index + episode_meta = { + "data/chunk_index": episode["data/chunk_index"], + "data/file_index": episode["data/file_index"], + "data/file_from_index": index, + } + if self.load_video: + episode_meta["videos/observation.images.wrist/from_timestamp"] = episode[ + "videos/observation.images.wrist/from_timestamp" + ] + self.trajectory_ids_to_metadata[trajectory_ids[-1]] = episode_meta + + # The saved index information should be directly readable here + return np.array(trajectory_ids), np.array(trajectory_lengths) + + def _get_all_steps(self) -> list[tuple[int, int]]: + """Get the trajectory IDs and base indices for all steps in the dataset. + + Returns: + list[tuple[int, int]]: A list of (trajectory_id, base_index) tuples.
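            For example (illustrative): two trajectories with lengths [2, 3] expand to
            [(0, 0), (0, 1), (1, 0), (1, 1), (1, 2)], minus any trajectories that are
            skipped because of read errors or empty language instructions.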
+ """ + # Create a hash key based on configuration to ensure cache validity + config_key = self._get_steps_config_key() + + # Create a unique filename based on config_key + # steps_filename = f"steps_{config_key}.pkl" + # @BUG + # fast get static steps @fangjing --> don't use hash to dynamic sample + steps_filename = "steps_data_index.pkl" + + + steps_path = self.dataset_path / "meta" / steps_filename + + # Try to load cached steps first + try: + if steps_path.exists(): + with open(steps_path, "rb") as f: + cached_data = pickle.load(f) + return cached_data["steps"] + + except (FileNotFoundError, pickle.PickleError, KeyError) as e: + print(f"Failed to load cached steps: {e}") + print("Computing steps from scratch...") + + # Compute steps using single process + all_steps = self._get_all_steps_single_process() + + # Cache the computed steps with unique filename + try: + cache_data = { + "config_key": config_key, + "steps": all_steps, + "num_trajectories": len(self.trajectory_ids), + "total_steps": len(all_steps), + "computed_timestamp": pd.Timestamp.now().isoformat(), + "delete_pause_frame": self.delete_pause_frame, + } + + # Ensure the meta directory exists + steps_path.parent.mkdir(parents=True, exist_ok=True) + + with open(steps_path, "wb") as f: + pickle.dump(cache_data, f, protocol=pickle.HIGHEST_PROTOCOL) + print(f"Cached steps saved to {steps_path}") + except Exception as e: + print(f"Failed to cache steps: {e}") + + return all_steps + + def _get_steps_config_key(self) -> str: + """Generate a configuration key for steps caching.""" + config_dict = { + "delete_pause_frame": self.delete_pause_frame, + "dataset_name": self.dataset_name, + } + # Create a hash of the configuration + config_str = str(sorted(config_dict.items())) + return hashlib.md5(config_str.encode()).hexdigest()[:12] # + + + def _get_all_steps_single_process(self) -> list[tuple[int, int]]: + """Original single-process implementation as fallback.""" + all_steps: list[tuple[int, int]] = [] + skipped_trajectories = 0 + processed_trajectories = 0 + + # Check if language modality is configured + has_language_modality = 'language' in self.modality_keys and len(self.modality_keys['language']) > 0 + # TODO why trajectory_length here, why not use data length? 
+ for trajectory_id, trajectory_length in tqdm(zip(self.trajectory_ids, self.trajectory_lengths), total=len(self.trajectory_ids), desc="Getting All Step"): + try: + if self._lerobot_version == "v2.0": + data = self.get_trajectory_data(trajectory_id) + elif self._lerobot_version == "v3.0": + data = self.get_trajectory_data_lerobot_v3(trajectory_id) + + trajectory_skipped = False + + # Check if trajectory has valid language instruction (if language modality is configured) + if has_language_modality: + self.curr_traj_data = data # Set current trajectory data for get_language to work + + language_instruction = self.get_language(trajectory_id, self.modality_keys['language'][0], 0) + if not language_instruction or language_instruction[0] == "": + print(f"Skipping trajectory {trajectory_id} due to empty language instruction") + skipped_trajectories += 1 + trajectory_skipped = True + continue + + except Exception as e: + print(f"Skipping trajectory {trajectory_id} due to read error: {e}") + skipped_trajectories += 1 + trajectory_skipped = True + continue + + if not trajectory_skipped: + processed_trajectories += 1 + + for base_index in range(trajectory_length): + all_steps.append((trajectory_id, base_index)) + + # Print summary statistics + print(f"Single-process summary: Processed {processed_trajectories} trajectories, skipped {skipped_trajectories} empty trajectories") + print(f"Total steps: {len(all_steps)} from {len(self.trajectory_ids)} trajectories") + + return all_steps + + def _get_position_and_gripper_values(self, data: pd.DataFrame) -> tuple[list, list]: + """Get position and gripper values based on available columns in the dataset.""" + # Get action keys from modality_keys + action_keys = self.modality_keys.get('action', []) + + # Extract position data + delta_position_values = None + position_candidates = ['delta_eef_position'] + coordinate_candidates = ['x', 'y', 'z'] + + # First try combined position fields + for pos_key in position_candidates: + full_key = f"action.{pos_key}" + if full_key in action_keys: + try: + # Get the lerobot key for this modality + le_action_cfg = self.lerobot_modality_meta.action + subkey = pos_key + if subkey in le_action_cfg: + le_key = le_action_cfg[subkey].original_key or subkey + if le_key in data.columns: + data_array = np.stack(data[le_key]) + le_indices = np.arange(le_action_cfg[subkey].start, le_action_cfg[subkey].end) + filtered_data = data_array[:, le_indices] + delta_position_values = filtered_data.tolist() + break + except Exception: + continue + + # If combined fields not found, try individual x,y,z coordinates + if delta_position_values is None: + x_data, y_data, z_data = None, None, None + for coord in coordinate_candidates: + full_key = f"action.{coord}" + if full_key in action_keys: + try: + le_action_cfg = self.lerobot_modality_meta.action + if coord in le_action_cfg: + le_key = le_action_cfg[coord].original_key or coord + if le_key in data.columns: + data_array = np.stack(data[le_key]) + le_indices = np.arange(le_action_cfg[coord].start, le_action_cfg[coord].end) + coord_data = data_array[:, le_indices].flatten() + if coord == 'x': + x_data = coord_data + elif coord == 'y': + y_data = coord_data + elif coord == 'z': + z_data = coord_data + except Exception: + continue + + if x_data is not None and y_data is not None and z_data is not None: + delta_position_values = np.column_stack((x_data, y_data, z_data)).tolist() + + if delta_position_values is None: + # Fallback to the old hardcoded approach if metadata approach fails + if 
'action.delta_eef_position' in data.columns: + delta_position_values = data['action.delta_eef_position'].to_numpy().tolist() + elif all(col in data.columns for col in ['action.x', 'action.y', 'action.z']): + x_vals = data['action.x'].to_numpy() + y_vals = data['action.y'].to_numpy() + z_vals = data['action.z'].to_numpy() + delta_position_values = np.column_stack((x_vals, y_vals, z_vals)).tolist() + else: + raise ValueError(f"No suitable position columns found. Available columns: {data.columns.tolist()}") + + # Extract gripper data + gripper_values = None + gripper_candidates = ['gripper_close', 'gripper'] + + for grip_key in gripper_candidates: + full_key = f"action.{grip_key}" + if full_key in action_keys: + try: + le_action_cfg = self.lerobot_modality_meta.action + if grip_key in le_action_cfg: + le_key = le_action_cfg[grip_key].original_key or grip_key + if le_key in data.columns: + data_array = np.stack(data[le_key]) + le_indices = np.arange(le_action_cfg[grip_key].start, le_action_cfg[grip_key].end) + gripper_data = data_array[:, le_indices].flatten() + gripper_values = gripper_data.tolist() + break + except Exception: + continue + + if gripper_values is None: + # Fallback to the old hardcoded approach if metadata approach fails + if 'action.gripper_close' in data.columns: + gripper_values = data['action.gripper_close'].to_numpy().tolist() + elif 'action.gripper' in data.columns: + gripper_values = data['action.gripper'].to_numpy().tolist() + else: + raise ValueError(f"No suitable gripper columns found. Available columns: {data.columns.tolist()}") + + return delta_position_values, gripper_values + + def _get_modality_keys(self) -> dict: + """Get the modality keys for the dataset. + The keys are the modality names, and the values are the keys for each modality. + See property `modality_keys` for the expected format. 
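        For example (an illustrative config, not taken from the repo), passing
        {"state": ModalityConfig(delta_indices=[0], modality_keys=["state.eef_position"])}
        yields {"state": ["state.eef_position"]}.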
+ """ + modality_keys = defaultdict(list) + for modality, config in self.modality_configs.items(): + modality_keys[modality] = config.modality_keys + return modality_keys + + def _get_delta_indices(self) -> dict[str, np.ndarray]: + """Restructure the delta indices to use modality.key as keys instead of just the modalities.""" + delta_indices: dict[str, np.ndarray] = {} + for config in self.modality_configs.values(): + for key in config.modality_keys: + delta_indices[key] = np.array(config.delta_indices) + return delta_indices + + def _get_lerobot_modality_meta(self) -> LeRobotModalityMetadata: + """Get the metadata for the LeRobot dataset.""" + modality_meta_path = self.dataset_path / LE_ROBOT_MODALITY_FILENAME + assert ( + modality_meta_path.exists() + ), f"Please provide a {LE_ROBOT_MODALITY_FILENAME} file in {self.dataset_path}" + with open(modality_meta_path, "r") as f: + modality_meta = LeRobotModalityMetadata.model_validate(json.load(f)) + return modality_meta + + def _get_lerobot_info_meta(self) -> dict: + """Get the metadata for the LeRobot dataset.""" + info_meta_path = self.dataset_path / LE_ROBOT_INFO_FILENAME + with open(info_meta_path, "r") as f: + info_meta = json.load(f) + return info_meta + + def _get_data_path_pattern(self) -> str: + """Get the data path pattern for the LeRobot dataset.""" + return self.lerobot_info_meta["data_path"] + + def _get_video_path_pattern(self) -> str: + """Get the video path pattern for the LeRobot dataset.""" + return self.lerobot_info_meta["video_path"] + + def _get_chunk_size(self) -> int: + """Get the chunk size for the LeRobot dataset.""" + return self.lerobot_info_meta["chunks_size"] + + def _get_tasks(self) -> pd.DataFrame: + """Get the tasks for the dataset.""" + if self._lerobot_version == "v2.0": + tasks_path = self.dataset_path / LE_ROBOT_TASKS_FILENAME + with open(tasks_path, "r") as f: + tasks = [json.loads(line) for line in f] + df = pd.DataFrame(tasks) + return df.set_index("task_index") + + elif self._lerobot_version == "v3.0": + tasks_path = self.dataset_path / LE_ROBOT3_TASKS_FILENAME + df = pd.read_parquet(tasks_path) + df = df.reset_index() # 把索引变成一列,列名通常为 'index' + df = df.rename(columns={'index': 'task'}) # 把 'index' 列重命名为 'task' + df = df[['task_index', 'task']] # 调整列顺序 + return df + def _check_integrity(self): + """Use the config to check if the keys are valid and detect silent data corruption.""" + ERROR_MSG_HEADER = f"Error occurred in initializing dataset {self.dataset_name}:\n" + + for modality_config in self.modality_configs.values(): + for key in modality_config.modality_keys: + if key == "lapa_action" or key == "dream_actions": + continue # no need for any metadata for lapa actions because it comes normalized + # Check if the key is valid + try: + self.lerobot_modality_meta.get_key_meta(key) + except Exception as e: + raise ValueError( + ERROR_MSG_HEADER + f"Unable to find key {key} in modality metadata:\n{e}" + ) + + def set_transforms_metadata(self, metadata: DatasetMetadata): + """Set the metadata for the transforms. This is useful for transforms that need to know the metadata, such as the normalization values.""" + self.transforms.set_metadata(metadata) + + def set_epoch(self, epoch: int): + """Set the epoch for the dataset. + + Args: + epoch (int): The epoch to set. + """ + self.epoch = epoch + + def __len__(self) -> int: + """Get the total number of data points in the dataset. + + Returns: + int: the total number of data points in the dataset. 
+ """ + return len(self.all_steps) + + def __str__(self) -> str: + """Get the description of the dataset.""" + return f"{self.dataset_name} ({len(self)} steps)" + + + def __getitem__(self, index: int) -> dict: + """Get the data for a single step in a trajectory. + + Args: + index (int): The index of the step to get. + + Returns: + dict: The data for the step. + """ + trajectory_id, base_index = self.all_steps[index] + data = self.get_step_data(trajectory_id, base_index) + + # Process all video keys dynamically + images = [] + for video_key in self.modality_keys.get("video", []): + image = data[video_key][0] + + image = Image.fromarray(image).resize((224, 224)) + images.append(image) + + # Get language and action data + language = data[self.modality_keys["language"][0]][0] + action = [] + for action_key in self.modality_keys["action"]: + action.append(data[action_key]) + action = np.concatenate(action, axis=1) + action = standardize_action_representation(action, self.tag) + + state = [] + for state_key in self.modality_keys["state"]: + state.append(data[state_key]) + state = np.concatenate(state, axis=1) + state = standardize_state_representation(state, self.tag) + + return dict(action=action, state=state, image=images, language=language, dataset_id=self._dataset_id) + + def get_step_data(self, trajectory_id: int, base_index: int) -> dict: + """Get the RAW data for a single step in a trajectory. No transforms are applied. + + Args: + trajectory_id (int): The name of the trajectory. + base_index (int): The base step index in the trajectory. + + Returns: + dict: The RAW data for the step. + + Example return: + { + "video": { + "video.image_side_0": [B, T, H, W, C], + "video.image_side_1": [B, T, H, W, C], + }, + "state": { + "state.eef_position": [B, T, state_dim], + "state.eef_rotation": [B, T, state_dim], + }, + "action": { + "action.eef_position": [B, T, action_dim], + "action.eef_rotation": [B, T, action_dim], + }, + } + """ + data = {} + # Get the data for all modalities # just for action base data + self.curr_traj_data = self.get_trajectory_data(trajectory_id) + # TODO @JinhuiYE The logic below is poorly implemented. Data reading should be directly based on curr_traj_data. 
+ for modality in self.modality_keys: + # Get the data corresponding to each key in the modality + for key in self.modality_keys[modality]: + data[key] = self.get_data_by_modality(trajectory_id, modality, key, base_index) + return data + + def get_trajectory_data(self, trajectory_id: int) -> pd.DataFrame: + """Get the data for a trajectory.""" + if self._lerobot_version == "v2.0": + + if self.curr_traj_id == trajectory_id and self.curr_traj_data is not None: + return self.curr_traj_data + else: + chunk_index = self.get_episode_chunk(trajectory_id) + parquet_path = self.dataset_path / self.data_path_pattern.format( + episode_chunk=chunk_index, episode_index=trajectory_id + ) + assert parquet_path.exists(), f"Parquet file not found at {parquet_path}" + return pd.read_parquet(parquet_path) + elif self._lerobot_version == "v3.0": + return self.get_trajectory_data_lerobot_v3(trajectory_id) + + def get_trajectory_data_lerobot_v3(self, trajectory_id: int) -> pd.DataFrame: + """Get the data for a trajectory from lerobot v3.""" + if self.curr_traj_id == trajectory_id and self.curr_traj_data is not None: + return self.curr_traj_data + else: #TODO check detail later + chunk_index = self.get_episode_chunk(trajectory_id) + + file_index = self.get_episode_file_index(trajectory_id) + # file_from_index = self.get_episode_file_from_index(trajectory_id) + + + parquet_path = self.dataset_path / self.data_path_pattern.format( + chunk_index=chunk_index, file_index=file_index + ) + assert parquet_path.exists(), f"Parquet file not found at {parquet_path}" + file_data = pd.read_parquet(parquet_path) + + # filter by trajectory_id + episode_data = file_data.loc[file_data["episode_index"] == trajectory_id].copy() + + # fix timestamp from epis index to file index for video alignment + if self.load_video: + from_timestamp = self.trajectory_ids_to_metadata[trajectory_id].get( + "videos/observation.images.wrist/from_timestamp", 0 + ) + episode_data["timestamp"] = episode_data["timestamp"] + from_timestamp + + return episode_data + + + def get_trajectory_index(self, trajectory_id: int) -> int: + """Get the index of the trajectory in the dataset by the trajectory ID. + This is useful when you need to get the trajectory length or sampling weight corresponding to the trajectory ID. + + Args: + trajectory_id (str): The ID of the trajectory. + + Returns: + int: The index of the trajectory in the dataset. + """ + trajectory_indices = np.where(self.trajectory_ids == trajectory_id)[0] + if len(trajectory_indices) != 1: + raise ValueError( + f"Error finding trajectory index for {trajectory_id}, found {trajectory_indices=}" + ) + return trajectory_indices[0] + + def get_episode_chunk(self, ep_index: int) -> int: + """Get the chunk index for an episode index.""" + return ep_index // self.chunk_size + def get_episode_file_index(self, ep_index: int) -> int: + """Get the file index for an episode index.""" + episode_meta = self.trajectory_ids_to_metadata[ep_index] + return episode_meta["data/file_index"] + + def get_episode_file_from_index(self, ep_index: int) -> int: + """Get the file from index for an episode index.""" + episode_meta = self.trajectory_ids_to_metadata[ep_index] + return episode_meta["data/file_from_index"] + + + def retrieve_data_and_pad( + self, + array: np.ndarray, + step_indices: np.ndarray, + max_length: int, + padding_strategy: str = "first_last", + ) -> np.ndarray: + """Retrieve the data from the dataset and pad it if necessary. + Args: + array (np.ndarray): The array to retrieve the data from. 
+ step_indices (np.ndarray): The step indices to retrieve the data for. + max_length (int): The maximum length of the data. + padding_strategy (str): The padding strategy, either "first" or "last". + """ + # Get the padding indices + front_padding_indices = step_indices < 0 + end_padding_indices = step_indices >= max_length + padding_positions = np.logical_or(front_padding_indices, end_padding_indices) + # Retrieve the data with the non-padding indices + # If there exists some padding, Given T step_indices, the shape of the retrieved data will be (T', ...) where T' < T + raw_data = array[step_indices[~padding_positions]] + assert isinstance(raw_data, np.ndarray), f"{type(raw_data)=}" + # This is the shape of the output, (T, ...) + if raw_data.ndim == 1: + expected_shape = (len(step_indices),) + else: + expected_shape = (len(step_indices), *array.shape[1:]) + + # Pad the data + output = np.zeros(expected_shape) + # Assign the non-padded data + output[~padding_positions] = raw_data + # If there exists some padding, pad the data + if padding_positions.any(): + if padding_strategy == "first_last": + # Use first / last step data to pad + front_padding_data = array[0] + end_padding_data = array[-1] + output[front_padding_indices] = front_padding_data + output[end_padding_indices] = end_padding_data + elif padding_strategy == "zero": + # Use zero padding + output[padding_positions] = 0 + else: + raise ValueError(f"Invalid padding strategy: {padding_strategy}") + return output + + def get_video_path(self, trajectory_id: int, key: str) -> Path: + chunk_index = self.get_episode_chunk(trajectory_id) + original_key = self.lerobot_modality_meta.video[key].original_key + if original_key is None: + original_key = key + if self._lerobot_version == "v2.0": + video_filename = self.video_path_pattern.format( + episode_chunk=chunk_index, episode_index=trajectory_id, video_key=original_key + ) + elif self._lerobot_version == "v3.0": + episode_meta = self.trajectory_ids_to_metadata[trajectory_id] + video_filename = self.video_path_pattern.format( + video_key=original_key, + chunk_index=episode_meta["data/chunk_index"], + file_index=episode_meta["data/file_index"], + ) + return self.dataset_path / video_filename + + def get_video( + self, + trajectory_id: int, + key: str, + base_index: int, + ) -> np.ndarray: + """Get the video frames for a trajectory by a base index. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (str): The ID of the trajectory. + key (str): The key of the video. + base_index (int): The base index of the trajectory. + + Returns: + np.ndarray: The video frames for the trajectory and frame indices. 
Shape: (T, H, W, C) + """ + # Get the step indices + step_indices = self.delta_indices[key] + base_index + # print(f"{step_indices=}") + # Get the trajectory index + trajectory_index = self.get_trajectory_index(trajectory_id) + # Ensure the indices are within the valid range + # This is equivalent to padding the video with extra frames at the beginning and end + step_indices = np.maximum(step_indices, 0) + step_indices = np.minimum(step_indices, self.trajectory_lengths[trajectory_index] - 1) + assert key.startswith("video."), f"Video key must start with 'video.', got {key}" + # Get the sub-key + key = key.replace("video.", "") + video_path = self.get_video_path(trajectory_id, key) + # Get the action/state timestamps for each frame in the video + assert self.curr_traj_data is not None, f"No data found for {trajectory_id=}" + assert "timestamp" in self.curr_traj_data.columns, f"No timestamp found in {trajectory_id=}" + timestamp: np.ndarray = self.curr_traj_data["timestamp"].to_numpy() + # Get the corresponding video timestamps from the step indices + video_timestamp = timestamp[step_indices] + + return get_frames_by_timestamps( + video_path.as_posix(), + video_timestamp, + video_backend=self.video_backend, # TODO + video_backend_kwargs=self.video_backend_kwargs, + ) + + def get_state_or_action( + self, + trajectory_id: int, + modality: str, + key: str, + base_index: int, + ) -> np.ndarray: + """Get the state or action data for a trajectory by a base index. + If the step indices are out of range, pad with the data: + if the data is stored in absolute format, pad with the first or last step data; + otherwise, pad with zero. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (int): The ID of the trajectory. + modality (str): The modality of the data. + key (str): The key of the data. + base_index (int): The base index of the trajectory. + + Returns: + np.ndarray: The data for the trajectory and step indices. + """ + # Get the step indices + step_indices = self.delta_indices[key] + base_index + # Get the trajectory index + trajectory_index = self.get_trajectory_index(trajectory_id) + # Get the maximum length of the trajectory + max_length = self.trajectory_lengths[trajectory_index] + assert key.startswith(modality + "."), f"{key} must start with {modality + '.'}, got {key}" + # Get the sub-key, e.g. 
state.joint_angles -> joint_angles + key = key.replace(modality + ".", "") + # Get the lerobot key + le_state_or_action_cfg = getattr(self.lerobot_modality_meta, modality) + le_key = le_state_or_action_cfg[key].original_key + if le_key is None: + le_key = key + # Get the data array, shape: (T, D) + assert self.curr_traj_data is not None, f"No data found for {trajectory_id=}" + assert le_key in self.curr_traj_data.columns, f"No {le_key} found in {trajectory_id=}" + data_array: np.ndarray = np.stack(self.curr_traj_data[le_key]) # type: ignore + assert data_array.ndim == 2, f"Expected 2D array, got key {le_key} is{data_array.shape} array" + le_indices = np.arange( + le_state_or_action_cfg[key].start, + le_state_or_action_cfg[key].end, + ) + data_array = data_array[:, le_indices] + # Get the state or action configuration + state_or_action_cfg = getattr(self.metadata.modalities, modality)[key] + + # Pad the data + return self.retrieve_data_and_pad( + array=data_array, + step_indices=step_indices, + max_length=max_length, + padding_strategy="first_last" if state_or_action_cfg.absolute else "zero", + # padding_strategy="zero", # HACK for realdata + ) + + def get_language( + self, + trajectory_id: int, + key: str, + base_index: int, + ) -> list[str]: + """Get the language annotation data for a trajectory by step indices. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (int): The ID of the trajectory. + key (str): The key of the annotation. + base_index (int): The base index of the trajectory. + + Returns: + list[str]: The annotation data for the trajectory and step indices. If no matching data is found, return empty strings. + """ + assert self.curr_traj_data is not None, f"No data found for {trajectory_id=}" + # Get the step indices + step_indices = self.delta_indices[key] + base_index + # Get the trajectory index + trajectory_index = self.get_trajectory_index(trajectory_id) + # Get the maximum length of the trajectory + max_length = self.trajectory_lengths[trajectory_index] + # Get the end times corresponding to the closest indices + step_indices = np.maximum(step_indices, 0) + step_indices = np.minimum(step_indices, max_length - 1) + # Get the annotations + task_indices: list[int] = [] + assert key.startswith( + "annotation." + ), f"Language key must start with 'annotation.', got {key}" + subkey = key.replace("annotation.", "") + annotation_meta = self.lerobot_modality_meta.annotation + assert annotation_meta is not None, f"Annotation metadata is None for {subkey}" + assert ( + subkey in annotation_meta + ), f"Annotation key {subkey} not found in metadata, available annotation keys: {annotation_meta.keys()}" + subkey_meta = annotation_meta[subkey] + original_key = subkey_meta.original_key + if original_key is None: + original_key = key + for i in range(len(step_indices)): # + # task_indices.append(self.curr_traj_data[original_key][step_indices[i]].item()) + value = self.curr_traj_data[original_key].iloc[step_indices[i]] # TODO check v2.0 + task_indices.append(value if isinstance(value, (int, float)) else value.item()) + + return self.tasks.loc[task_indices]["task"].tolist() + + def get_data_by_modality( + self, + trajectory_id: int, + modality: str, + key: str, + base_index: int, + ): + """Get the data corresponding to the modality for a trajectory by a base index. + This method will call the corresponding helper method based on the modality. + See the helper methods for more details. 
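        For example (illustrative), get_data_by_modality(0, "video", "video.image_side_0", 10)
        simply forwards to get_video(0, "video.image_side_0", 10), while the "state" and
        "action" modalities forward to get_state_or_action.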
+ NOTE: For the language modality, the data is padded with empty strings if no matching data is found. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (int): The ID of the trajectory. + modality (str): The modality of the data. + key (str): The key of the data. + base_index (int): The base index of the trajectory. + """ + if modality == "video": + return self.get_video(trajectory_id, key, base_index) + elif modality == "state" or modality == "action": + return self.get_state_or_action(trajectory_id, modality, key, base_index) + elif modality == "language": + return self.get_language(trajectory_id, key, base_index) + else: + raise ValueError(f"Invalid modality: {modality}") + + def _save_dataset_statistics_(self, save_path: Path | str, format: str = "json") -> None: + """ + Save dataset statistics to specified path in the required format. + Only includes statistics for keys that are actually used in the dataset. + Key order follows modality config order. + + Args: + save_path (Path | str): Path to save the statistics file + format (str): Save format, currently only supports "json" + """ + save_path = Path(save_path) + save_path.parent.mkdir(parents=True, exist_ok=True) + + # Build the data structure to save + statistics_data = {} + + # Get used modality keys + used_action_keys, used_state_keys = get_used_modality_keys(self.modality_keys) + + # Organize statistics by tag + tag = self.tag + tag_stats = {} + + # Process action statistics (only for used keys, config order) + if hasattr(self.metadata.statistics, 'action') and self.metadata.statistics.action: + action_stats = self.metadata.statistics.action + filtered_action_stats = { + key: action_stats[key] + for key in used_action_keys + if key in action_stats + } + + if filtered_action_stats: + # Combine statistics from filtered action sub-keys + combined_action_stats = combine_modality_stats(filtered_action_stats) + + # Add mask field based on whether it's gripper or not + mask = generate_action_mask_for_used_keys( + self.metadata.modalities.action, filtered_action_stats.keys() + ) + combined_action_stats["mask"] = mask + + tag_stats["action"] = combined_action_stats + + # Process state statistics (only for used keys, config order) + if hasattr(self.metadata.statistics, 'state') and self.metadata.statistics.state: + state_stats = self.metadata.statistics.state + filtered_state_stats = { + key: state_stats[key] + for key in used_state_keys + if key in state_stats + } + + if filtered_state_stats: + combined_state_stats = combine_modality_stats(filtered_state_stats) + tag_stats["state"] = combined_state_stats + + # Add dataset counts + tag_stats["num_transitions"] = len(self) + tag_stats["num_trajectories"] = len(self.trajectory_ids) + + statistics_data[tag] = tag_stats + + # Save as JSON file + if format.lower() == "json": + if not str(save_path).endswith('.json'): + save_path = save_path.with_suffix('.json') + with open(save_path, 'w', encoding='utf-8') as f: + json.dump(statistics_data, f, indent=2, ensure_ascii=False) + else: + raise ValueError(f"Unsupported format: {format}. 
Currently only 'json' is supported.")
+
+        print(f"Single dataset statistics saved to: {save_path}")
+        print(f"Used action keys (reordered): {list(used_action_keys)}")
+        print(f"Used state keys (reordered): {list(used_state_keys)}")
+
+
+
+class MixtureSpecElement(BaseModel):
+    dataset_path: list[Path] | Path = Field(..., description="The path to the dataset.")
+    dataset_weight: float = Field(..., description="The weight of the dataset in the mixture.")
+    distribute_weights: bool = Field(
+        default=False,
+        description="Whether to distribute the weights of the dataset across all the paths. If True, the weights will be evenly distributed across all the paths.",
+    )
+
+
+# Helper functions for dataset statistics
+
+def combine_modality_stats(modality_stats: dict) -> dict:
+    """
+    Combine statistics from all sub-keys under a modality.
+
+    Args:
+        modality_stats (dict): Statistics for a modality, containing multiple sub-keys.
+            Each sub-key contains a DatasetStatisticalValues object.
+
+    Returns:
+        dict: Combined statistics
+    """
+    combined_stats = {
+        "mean": [],
+        "std": [],
+        "max": [],
+        "min": [],
+        "q01": [],
+        "q99": []
+    }
+
+    # Combine statistics in sub-key order
+    for subkey in modality_stats.keys():
+        subkey_stats = modality_stats[subkey]  # This is a DatasetStatisticalValues object
+
+        # Convert DatasetStatisticalValues to dict-like access
+        for stat_name in ["mean", "std", "max", "min", "q01", "q99"]:
+            stat_value = getattr(subkey_stats, stat_name)
+            if isinstance(stat_value, (list, tuple)):
+                combined_stats[stat_name].extend(stat_value)
+            else:
+                # Handle NDArray case - convert to list
+                if hasattr(stat_value, 'tolist'):
+                    combined_stats[stat_name].extend(stat_value.tolist())
+                else:
+                    combined_stats[stat_name].append(float(stat_value))
+
+    return combined_stats
+
+def generate_action_mask_for_used_keys(action_modalities: dict, used_action_keys_ordered) -> list[bool]:
+    """
+    Generate a normalization mask for the used action keys, in the same order as the combined statistics.
+    Gripper-related dimensions are set to False (excluded from de/normalization); all other dimensions are set to True.
+
+    Args:
+        action_modalities (dict): Configuration information for action modalities.
+        used_action_keys_ordered: Iterable of actually used action keys in the correct order.
+
+    Returns:
+        list[bool]: List of mask values
+    """
+    mask = []
+
+    # Generate mask in the same order as the statistics were combined
+    for subkey in used_action_keys_ordered:
+        if subkey in action_modalities:
+            subkey_config = action_modalities[subkey]
+
+            # Get dimension count from shape
+            if hasattr(subkey_config, 'shape') and len(subkey_config.shape) > 0:
+                dim_count = subkey_config.shape[0]
+            else:
+                dim_count = 1
+
+            # Check if it's gripper-related
+            is_gripper = "gripper" in subkey.lower()
+
+            # Generate mask value for each dimension
+            for _ in range(dim_count):
+                mask.append(not is_gripper)  # gripper is False, others are True
+
+    return mask
+
+def get_used_modality_keys(modality_keys: dict) -> tuple[list, list]:
+    """Extract the used action and state keys (in config order) from the modality configuration."""
+    used_action_keys = []
+    used_state_keys = []
+
+    # Extract action keys (remove "action." prefix)
+    for action_key in modality_keys.get("action", []):
+        if action_key.startswith("action."):
+            clean_key = action_key.replace("action.", "")
+            used_action_keys.append(clean_key)
+
+    # Extract state keys (remove "state."
prefix) + for state_key in modality_keys.get("state", []): + if state_key.startswith("state."): + clean_key = state_key.replace("state.", "") + used_state_keys.append(clean_key) + + return used_action_keys, used_state_keys + + +def safe_hash(input_tuple): + # keep 128 bits of the hash + tuple_string = repr(input_tuple).encode("utf-8") + sha256 = hashlib.sha256() + sha256.update(tuple_string) + + seed = int(sha256.hexdigest(), 16) + + return seed & 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF + + +class LeRobotMixtureDataset(Dataset): + """ + A mixture of multiple datasets. This class samples a single dataset based on the dataset weights and then calls the `__getitem__` method of the sampled dataset. + It is recommended to modify the single dataset class instead of this class. + """ + + def __init__( + self, + data_mixture: Sequence[tuple[LeRobotSingleDataset, float]], + mode: str, + balance_dataset_weights: bool = True, + balance_trajectory_weights: bool = True, + seed: int = 42, + metadata_config: dict = { + "percentile_mixing_method": "min_max", + }, + **kwargs, + ): + """ + Initialize the mixture dataset. + + Args: + data_mixture (list[tuple[LeRobotSingleDataset, float]]): Datasets and their corresponding weights. + mode (str): If "train", __getitem__ will return different samples every epoch; if "val" or "test", __getitem__ will return the same sample every epoch. + balance_dataset_weights (bool): If True, the weight of dataset will be multiplied by the total trajectory length of each dataset. + balance_trajectory_weights (bool): If True, sample trajectories within a dataset weighted by their length; otherwise, use equal weighting. + seed (int): Random seed for sampling. + """ + datasets: list[LeRobotSingleDataset] = [] + dataset_sampling_weights: list[float] = [] + for dataset, weight in data_mixture: + # Check if dataset is valid and has data + if len(dataset) == 0: + print(f"Warning: Skipping empty dataset {dataset.dataset_name}") + continue + datasets.append(dataset) + dataset_sampling_weights.append(weight) + + if len(datasets) == 0: + raise ValueError("No valid datasets found in the mixture. All datasets are empty.") + + self.datasets = datasets + self.balance_dataset_weights = balance_dataset_weights + self.balance_trajectory_weights = balance_trajectory_weights + self.seed = seed + self.mode = mode + + # Set properties for sampling + + # 1. Dataset lengths + self._dataset_lengths = np.array([len(dataset) for dataset in self.datasets]) + print(f"Dataset lengths: {self._dataset_lengths}") + + # 2. Dataset sampling weights + self._dataset_sampling_weights = np.array(dataset_sampling_weights) + + if self.balance_dataset_weights: + self._dataset_sampling_weights *= self._dataset_lengths + + # Check for zero or negative weights before normalization + if np.any(self._dataset_sampling_weights <= 0): + print(f"Warning: Found zero or negative sampling weights: {self._dataset_sampling_weights}") + # Set minimum weight to prevent division issues + self._dataset_sampling_weights = np.maximum(self._dataset_sampling_weights, 1e-8) + + # Normalize weights + weights_sum = self._dataset_sampling_weights.sum() + if weights_sum == 0 or np.isnan(weights_sum): + print(f"Error: Invalid weights sum: {weights_sum}") + # Fallback to equal weights + self._dataset_sampling_weights = np.ones(len(self.datasets)) / len(self.datasets) + print(f"Fallback to equal weights") + else: + self._dataset_sampling_weights /= weights_sum + + # 3. 
Trajectory sampling weights + self._trajectory_sampling_weights: list[np.ndarray] = [] + for i, dataset in enumerate(self.datasets): + trajectory_sampling_weights = np.ones(len(dataset.trajectory_lengths)) + if self.balance_trajectory_weights: + trajectory_sampling_weights *= dataset.trajectory_lengths + + # Check for zero or negative weights before normalization + if np.any(trajectory_sampling_weights <= 0): + print(f"Warning: Dataset {i} has zero or negative trajectory weights") + trajectory_sampling_weights = np.maximum(trajectory_sampling_weights, 1e-8) + + # Normalize weights + weights_sum = trajectory_sampling_weights.sum() + if weights_sum == 0 or np.isnan(weights_sum): + print(f"Error: Dataset {i} has invalid trajectory weights sum: {weights_sum}") + # Fallback to equal weights + trajectory_sampling_weights = np.ones(len(dataset.trajectory_lengths)) / len(dataset.trajectory_lengths) + else: + trajectory_sampling_weights /= weights_sum + + self._trajectory_sampling_weights.append(trajectory_sampling_weights) + + # 4. Primary dataset indices + self._primary_dataset_indices = np.array(dataset_sampling_weights) == 1.0 + if not np.any(self._primary_dataset_indices): + print(f"Warning: No dataset with weight 1.0 found. Original weights: {dataset_sampling_weights}") + # Fallback: use the dataset(s) with maximum weight as primary + max_weight = max(dataset_sampling_weights) + self._primary_dataset_indices = np.array(dataset_sampling_weights) == max_weight + print(f"Using datasets with maximum weight {max_weight} as primary: {self._primary_dataset_indices}") + + if not np.any(self._primary_dataset_indices): + # This should never happen, but just in case + print("Error: Still no primary dataset found. Using first dataset as primary.") + self._primary_dataset_indices = np.zeros(len(self.datasets), dtype=bool) + self._primary_dataset_indices[0] = True + + # Set the epoch and sample the first epoch + self.set_epoch(0) + + self.update_metadata(metadata_config) + + @property + def dataset_lengths(self) -> np.ndarray: + """The lengths of each dataset.""" + return self._dataset_lengths + + @property + def dataset_sampling_weights(self) -> np.ndarray: + """The sampling weights for each dataset.""" + return self._dataset_sampling_weights + + @property + def trajectory_sampling_weights(self) -> list[np.ndarray]: + """The sampling weights for each trajectory in each dataset.""" + return self._trajectory_sampling_weights + + @property + def primary_dataset_indices(self) -> np.ndarray: + """The indices of the primary datasets.""" + return self._primary_dataset_indices + + def __str__(self) -> str: + dataset_descriptions = [] + for dataset, weight in zip(self.datasets, self.dataset_sampling_weights): + dataset_description = { + "Dataset": str(dataset), + "Sampling weight": float(weight), + } + dataset_descriptions.append(dataset_description) + return json.dumps({"Mixture dataset": dataset_descriptions}, indent=2) + + def set_epoch(self, epoch: int): + """Set the epoch for the dataset. + + Args: + epoch (int): The epoch to set. 
+ """ + self.epoch = epoch + # self.sampled_steps = self.sample_epoch() + + def sample_step(self, index: int) -> tuple[LeRobotSingleDataset, int, int]: + """Sample a single step from the dataset.""" + # return self.sampled_steps[index] + + # Set seed + seed = index if self.mode != "train" else safe_hash((self.epoch, index, self.seed)) + rng = np.random.default_rng(seed) + + # Sample dataset + dataset_index = rng.choice(len(self.datasets), p=self.dataset_sampling_weights) + dataset = self.datasets[dataset_index] + + # Sample trajectory + # trajectory_index = rng.choice( + # len(dataset.trajectory_ids), p=self.trajectory_sampling_weights[dataset_index] + # ) + # trajectory_id = dataset.trajectory_ids[trajectory_index] + + # # Sample step + # base_index = rng.choice(dataset.trajectory_lengths[trajectory_index]) + # return dataset, trajectory_id, base_index + single_step_index = rng.choice(len(dataset.all_steps)) + trajectory_id, base_index = dataset.all_steps[single_step_index] + return dataset, trajectory_id, base_index + + def __getitem__(self, index: int) -> dict: + """Get the data for a single trajectory and start index. + + Args: + index (int): The index of the trajectory to get. + + Returns: + dict: The data for the trajectory and start index. + """ + max_retries = 10 + last_exception = None + + for attempt in range(max_retries): + try: + dataset, trajectory_name, step = self.sample_step(index) + data_raw = dataset.get_step_data(trajectory_name, step) + data = dataset.transforms(data_raw) + + # Process all video keys dynamically + images = [] + for video_key in dataset.modality_keys.get("video", []): + image = data[video_key][0] + + image = Image.fromarray(image).resize((224, 224)) #TODO check if this is ok + images.append(image) + + # Get language and action data + language = data[dataset.modality_keys["language"][0]][0] + action = [] + for action_key in dataset.modality_keys["action"]: + action.append(data[action_key]) + action = np.concatenate(action, axis=1).astype(np.float16) + action = standardize_action_representation(action, dataset.tag) + + state = [] + for state_key in dataset.modality_keys["state"]: + state.append(data[state_key]) + state = np.concatenate(state, axis=1).astype(np.float16) + state = standardize_state_representation(state, dataset.tag) + + return dict(action=action, state=state, image=images, lang=language, dataset_id=dataset._dataset_id) + + except Exception as e: + last_exception = e + if attempt < max_retries - 1: + # Log the error but continue trying + print(f"Attempt {attempt + 1}/{max_retries} failed for index {index}: {e}") + print(f"Retrying with new sample...") + # For retry, we can use a slightly different index to get a new sample + # This helps avoid getting stuck on the same problematic sample + index = random.randint(0, len(self) - 1) + else: + # All retries exhausted + print(f"All {max_retries} attempts failed for index {index}") + print(f"Last error: {last_exception}") + # Return a dummy sample or re-raise the exception + raise last_exception + + def __len__(self) -> int: + """Get the length of a single epoch in the mixture. + + Returns: + int: The length of a single epoch in the mixture. 
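+
+        Note:
+            The length is derived from the "primary" datasets (those with sampling weight 1.0,
+            or the maximum weight as a fallback): it is the largest len(dataset) / sampling_weight
+            over those datasets, i.e. roughly the number of draws needed to cover each primary
+            dataset once in expectation.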
+ """ + # Check for potential issues + if len(self.datasets) == 0: + return 0 + + # Check if any dataset lengths are 0 or NaN + if np.any(self.dataset_lengths == 0) or np.any(np.isnan(self.dataset_lengths)): + print(f"Warning: Found zero or NaN dataset lengths: {self.dataset_lengths}") + # Filter out zero/NaN length datasets + valid_indices = (self.dataset_lengths > 0) & (~np.isnan(self.dataset_lengths)) + if not np.any(valid_indices): + print("Error: All datasets have zero or NaN length") + return 0 + else: + valid_indices = np.ones(len(self.datasets), dtype=bool) + + # Check if any sampling weights are 0 or NaN + if np.any(self.dataset_sampling_weights == 0) or np.any(np.isnan(self.dataset_sampling_weights)): + print(f"Warning: Found zero or NaN sampling weights: {self.dataset_sampling_weights}") + # Use only valid weights + valid_weights = (self.dataset_sampling_weights > 0) & (~np.isnan(self.dataset_sampling_weights)) + valid_indices = valid_indices & valid_weights + if not np.any(valid_indices): + print("Error: All sampling weights are zero or NaN") + return 0 + + # Check primary dataset indices + primary_and_valid = self.primary_dataset_indices & valid_indices + if not np.any(primary_and_valid): + print(f"Warning: No valid primary datasets found. Primary indices: {self.primary_dataset_indices}, Valid indices: {valid_indices}") + # Fallback: use the largest valid dataset + if np.any(valid_indices): + max_length = self.dataset_lengths[valid_indices].max() + print(f"Fallback: Using maximum dataset length: {max_length}") + return int(max_length) + else: + return 0 + + # Calculate the ratio and get max + ratios = (self.dataset_lengths / self.dataset_sampling_weights)[primary_and_valid] + + # Check for NaN or inf in ratios + if np.any(np.isnan(ratios)) or np.any(np.isinf(ratios)): + print(f"Warning: Found NaN or inf in ratios: {ratios}") + print(f"Dataset lengths: {self.dataset_lengths[primary_and_valid]}") + print(f"Sampling weights: {self.dataset_sampling_weights[primary_and_valid]}") + # Filter out invalid ratios + valid_ratios = ratios[~np.isnan(ratios) & ~np.isinf(ratios)] + if len(valid_ratios) == 0: + print("Error: All ratios are NaN or inf") + return 0 + max_ratio = valid_ratios.max() + else: + max_ratio = ratios.max() + + result = int(max_ratio) + if result == 0: + print(f"Warning: Dataset mixture length is 0") + return result + + @staticmethod + def compute_overall_statistics( + per_task_stats: list[dict[str, dict[str, list[float] | np.ndarray]]], + dataset_sampling_weights: list[float] | np.ndarray, + percentile_mixing_method: str = "weighted_average", + ) -> dict[str, dict[str, list[float]]]: + """ + Computes overall statistics from per-task statistics using dataset sample weights. + + Args: + per_task_stats: List of per-task statistics. + Example format of one element in the per-task statistics list: + { + "state.gripper": { + "min": [...], + "max": [...], + "mean": [...], + "std": [...], + "q01": [...], + "q99": [...], + }, + ... + } + dataset_sampling_weights: List of sample weights for each task. + percentile_mixing_method: The method to mix the percentiles, either "weighted_average" or "weighted_std". + + Returns: + A dict of overall statistics per modality. 
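+
+        Note:
+            Means and variances are combined as a weighted mixture:
+            mean = sum_i w_i * mean_i and var = sum_i w_i * (std_i**2 + mean_i**2) - mean**2,
+            with the weights normalized to sum to 1. Min/max are taken elementwise across tasks,
+            and q01/q99 are combined according to `percentile_mixing_method`.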
+ """ + # Normalize the sample weights to sum to 1 + dataset_sampling_weights = np.array(dataset_sampling_weights) + normalized_weights = dataset_sampling_weights / dataset_sampling_weights.sum() + + # Initialize overall statistics dict + overall_stats: dict[str, dict[str, list[float]]] = {} + + # Get the list of modality keys + modality_keys = per_task_stats[0].keys() + + for modality in modality_keys: + # Number of dimensions (assuming consistent across tasks) + num_dims = len(per_task_stats[0][modality]["mean"]) + + # Initialize accumulators for means and variances + weighted_means = np.zeros(num_dims) + weighted_squares = np.zeros(num_dims) + + # Collect min, max, q01, q99 from all tasks + min_list = [] + max_list = [] + q01_list = [] + q99_list = [] + + for task_idx, task_stats in enumerate(per_task_stats): + w_i = normalized_weights[task_idx] + stats = task_stats[modality] + means = np.array(stats["mean"]) + stds = np.array(stats["std"]) + + # Update weighted sums for mean and variance + weighted_means += w_i * means + weighted_squares += w_i * (stds**2 + means**2) + + # Collect min, max, q01, q99 + min_list.append(stats["min"]) + max_list.append(stats["max"]) + q01_list.append(stats["q01"]) + q99_list.append(stats["q99"]) + + # Compute overall mean + overall_mean = weighted_means.tolist() + + # Compute overall variance and std deviation + overall_variance = weighted_squares - weighted_means**2 + overall_std = np.sqrt(overall_variance).tolist() + + # Compute overall min and max per dimension + overall_min = np.min(np.array(min_list), axis=0).tolist() + overall_max = np.max(np.array(max_list), axis=0).tolist() + + # Compute overall q01 and q99 per dimension + # Use weighted average of per-task quantiles + q01_array = np.array(q01_list) + q99_array = np.array(q99_list) + if percentile_mixing_method == "weighted_average": + weighted_q01 = np.average(q01_array, axis=0, weights=normalized_weights).tolist() + weighted_q99 = np.average(q99_array, axis=0, weights=normalized_weights).tolist() + # std_q01 = np.std(q01_array, axis=0).tolist() + # std_q99 = np.std(q99_array, axis=0).tolist() + # print(modality) + # print(f"{std_q01=}, {std_q99=}") + # print(f"{weighted_q01=}, {weighted_q99=}") + elif percentile_mixing_method == "min_max": + weighted_q01 = np.min(q01_array, axis=0).tolist() + weighted_q99 = np.max(q99_array, axis=0).tolist() + else: + raise ValueError(f"Invalid percentile mixing method: {percentile_mixing_method}") + + # Store the overall statistics for the modality + overall_stats[modality] = { + "min": overall_min, + "max": overall_max, + "mean": overall_mean, + "std": overall_std, + "q01": weighted_q01, + "q99": weighted_q99, + } + + return overall_stats + + @staticmethod + def merge_metadata( + metadatas: list[DatasetMetadata], + dataset_sampling_weights: list[float], + percentile_mixing_method: str, + ) -> DatasetMetadata: + """Merge multiple metadata into one.""" + # Convert to dicts + metadata_dicts = [metadata.model_dump(mode="json") for metadata in metadatas] + # Create a new metadata dict + merged_metadata = {} + + # Check all metadata have the same embodiment tag + assert all( + metadata.embodiment_tag == metadatas[0].embodiment_tag for metadata in metadatas + ), "All metadata must have the same embodiment tag" + merged_metadata["embodiment_tag"] = metadatas[0].embodiment_tag + + # Merge the dataset statistics + dataset_statistics = {} + dataset_statistics["state"] = LeRobotMixtureDataset.compute_overall_statistics( + per_task_stats=[m["statistics"]["state"] for m in 
metadata_dicts], + dataset_sampling_weights=dataset_sampling_weights, + percentile_mixing_method=percentile_mixing_method, + ) + dataset_statistics["action"] = LeRobotMixtureDataset.compute_overall_statistics( + per_task_stats=[m["statistics"]["action"] for m in metadata_dicts], + dataset_sampling_weights=dataset_sampling_weights, + percentile_mixing_method=percentile_mixing_method, + ) + merged_metadata["statistics"] = dataset_statistics + + # Merge the modality configs + modality_configs = defaultdict(set) + for metadata in metadata_dicts: + for modality, configs in metadata["modalities"].items(): + modality_configs[modality].add(json.dumps(configs)) + merged_metadata["modalities"] = {} + for modality, configs in modality_configs.items(): + # Check that all modality configs correspond to the same tag matches + assert ( + len(configs) == 1 + ), f"Multiple modality configs for modality {modality}: {list(configs)}" + merged_metadata["modalities"][modality] = json.loads(configs.pop()) + + return DatasetMetadata.model_validate(merged_metadata) + + def update_metadata(self, metadata_config: dict, cached_statistics_path: Path | str | None = None) -> None: + """ + Merge multiple metadatas into one and set the transforms with the merged metadata. + + Args: + metadata_config (dict): Configuration for the metadata. + "percentile_mixing_method": The method to mix the percentiles, either "weighted_average" or "min_max". + weighted_average: Use the weighted average of the percentiles using the weight used in sampling the datasets. + min_max: Use the min of the 1st percentile and max of the 99th percentile. + """ + # If cached path is provided, try to load and apply + if cached_statistics_path is not None: + try: + cached_stats = self.load_merged_statistics(cached_statistics_path) + self.apply_cached_statistics(cached_stats) + return + except (FileNotFoundError, KeyError, ValidationError) as e: + print(f"Failed to load cached statistics: {e}") + print("Falling back to computing statistics from scratch...") + + self.tag = EmbodimentTag.NEW_EMBODIMENT.value + self.merged_metadata: dict[str, DatasetMetadata] = {} + # Group metadata by tag + all_metadatas: dict[str, list[DatasetMetadata]] = {} + for dataset in self.datasets: + if dataset.tag not in all_metadatas: + all_metadatas[dataset.tag] = [] + all_metadatas[dataset.tag].append(dataset.metadata) + for tag, metadatas in all_metadatas.items(): + self.merged_metadata[tag] = self.merge_metadata( + metadatas=metadatas, + dataset_sampling_weights=self.dataset_sampling_weights.tolist(), + percentile_mixing_method=metadata_config["percentile_mixing_method"], + ) + for dataset in self.datasets: + dataset.set_transforms_metadata(self.merged_metadata[dataset.tag]) + + def save_dataset_statistics(self, save_path: Path | str, format: str = "json") -> None: + """ + Save merged dataset statistics to specified path in the required format. + Only includes statistics for keys that are actually used in the datasets. + Key order follows each tag's modality config order. 
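+
+        The resulting JSON has one entry per embodiment tag, for example:
+            {
+                "<tag>": {
+                    "action": {"mean": [...], "std": [...], "max": [...], "min": [...],
+                               "q01": [...], "q99": [...], "mask": [...]},
+                    "state": {"mean": [...], "std": [...], "max": [...], "min": [...],
+                              "q01": [...], "q99": [...]},
+                    "num_transitions": <int>,
+                    "num_trajectories": <int>
+                }
+            }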
+ + Args: + save_path (Path | str): Path to save the statistics file + format (str): Save format, currently only supports "json" + """ + save_path = Path(save_path) + save_path.parent.mkdir(parents=True, exist_ok=True) + + # Build the data structure to save + statistics_data = {} + + # Keep key orders per embodiment tag (from modality config order) + tag_to_used_action_keys = {} + tag_to_used_state_keys = {} + for dataset in self.datasets: + if dataset.tag in tag_to_used_action_keys: + continue + used_action_keys, used_state_keys = get_used_modality_keys(dataset.modality_keys) + tag_to_used_action_keys[dataset.tag] = used_action_keys + tag_to_used_state_keys[dataset.tag] = used_state_keys + + # Organize statistics by tag + for tag, merged_metadata in self.merged_metadata.items(): + tag_stats = {} + + # Process action statistics + if hasattr(merged_metadata.statistics, 'action') and merged_metadata.statistics.action: + action_stats = merged_metadata.statistics.action + + used_action_keys = tag_to_used_action_keys.get(tag, []) + filtered_action_stats = { + key: action_stats[key] + for key in used_action_keys + if key in action_stats + } + + if filtered_action_stats: + combined_action_stats = combine_modality_stats(filtered_action_stats) + + mask = generate_action_mask_for_used_keys( + merged_metadata.modalities.action, filtered_action_stats.keys() + ) + combined_action_stats["mask"] = mask + + tag_stats["action"] = combined_action_stats + + # Process state statistics + if hasattr(merged_metadata.statistics, 'state') and merged_metadata.statistics.state: + state_stats = merged_metadata.statistics.state + + used_state_keys = tag_to_used_state_keys.get(tag, []) + filtered_state_stats = { + key: state_stats[key] + for key in used_state_keys + if key in state_stats + } + + if filtered_state_stats: + combined_state_stats = combine_modality_stats(filtered_state_stats) + tag_stats["state"] = combined_state_stats + + # Add dataset counts + tag_stats.update(self._get_dataset_counts(tag)) + + statistics_data[tag] = tag_stats + + # Save file + if format.lower() == "json": + if not str(save_path).endswith('.json'): + save_path = save_path.with_suffix('.json') + with open(save_path, 'w', encoding='utf-8') as f: + json.dump(statistics_data, f, indent=2, ensure_ascii=False) + else: + raise ValueError(f"Unsupported format: {format}. Currently only 'json' is supported.") + + print(f"Merged dataset statistics saved to: {save_path}") + print(f"Used action keys by tag: {tag_to_used_action_keys}") + print(f"Used state keys by tag: {tag_to_used_state_keys}") + + + def _combine_modality_stats(self, modality_stats: dict) -> dict: + """Backward compatibility wrapper.""" + return combine_modality_stats(modality_stats) + + def _generate_action_mask_for_used_keys(self, action_modalities: dict, used_action_keys_ordered) -> list[bool]: + """Backward compatibility wrapper.""" + return generate_action_mask_for_used_keys(action_modalities, used_action_keys_ordered) + + def _get_dataset_counts(self, tag: str) -> dict: + """ + Get dataset count information for specified tag. 
+ + Args: + tag (str): embodiment tag + + Returns: + dict: Dictionary containing num_transitions and num_trajectories + """ + num_transitions = 0 + num_trajectories = 0 + + # Count dataset information belonging to this tag + for dataset in self.datasets: + if dataset.tag == tag: + num_transitions += len(dataset) + num_trajectories += len(dataset.trajectory_ids) + + return { + "num_transitions": num_transitions, + "num_trajectories": num_trajectories + } + + @classmethod + def load_merged_statistics(cls, load_path: Path | str) -> dict: + """ + Load merged dataset statistics from file. + + Args: + load_path (Path | str): Path to the statistics file + + Returns: + dict: Dictionary containing merged statistics + """ + load_path = Path(load_path) + if not load_path.exists(): + raise FileNotFoundError(f"Statistics file not found: {load_path}") + + if load_path.suffix.lower() == '.json': + with open(load_path, 'r', encoding='utf-8') as f: + return json.load(f) + elif load_path.suffix.lower() == '.pkl': + import pickle + with open(load_path, 'rb') as f: + return pickle.load(f) + else: + raise ValueError(f"Unsupported file format: {load_path.suffix}") + + def apply_cached_statistics(self, cached_statistics: dict) -> None: + """ + Apply cached statistics to avoid recomputation. + + Args: + cached_statistics (dict): Statistics loaded from file + """ + # Validate that cached statistics match current datasets + if "metadata" in cached_statistics: + cached_dataset_names = set(cached_statistics["metadata"]["dataset_names"]) + current_dataset_names = set(dataset.dataset_name for dataset in self.datasets) + + if cached_dataset_names != current_dataset_names: + print("Warning: Cached statistics dataset names don't match current datasets.") + print(f"Cached: {cached_dataset_names}") + print(f"Current: {current_dataset_names}") + return + + # Apply cached statistics + self.merged_metadata = {} + for tag, stats_data in cached_statistics.items(): + if tag == "metadata": # Skip metadata field + continue + + # Convert back to DatasetMetadata format + metadata_dict = { + "embodiment_tag": tag, + "statistics": { + "action": {}, + "state": {} + }, + "modalities": {} + } + + # Convert action statistics back + if "action" in stats_data: + action_data = stats_data["action"] + # This is simplified - you may need to split back to sub-keys + metadata_dict["statistics"]["action"] = action_data + + # Convert state statistics back + if "state" in stats_data: + state_data = stats_data["state"] + metadata_dict["statistics"]["state"] = state_data + + self.merged_metadata[tag] = DatasetMetadata.model_validate(metadata_dict) + + # Update transforms metadata for each dataset + for dataset in self.datasets: + if dataset.tag in self.merged_metadata: + dataset.set_transforms_metadata(self.merged_metadata[dataset.tag]) + + print(f"Applied cached statistics for {len(self.merged_metadata)} embodiment tags.") + diff --git a/code/dataloader/gr00t_lerobot/datasets_bak2.py b/code/dataloader/gr00t_lerobot/datasets_bak2.py new file mode 100644 index 0000000000000000000000000000000000000000..43da9dc9614fcc36b3794695dc6a0b0d36cf7162 --- /dev/null +++ b/code/dataloader/gr00t_lerobot/datasets_bak2.py @@ -0,0 +1,2145 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +""" +In this file, we define 3 types of datasets: +1. LeRobotSingleDataset: a single dataset for a given embodiment tag +2. LeRobotMixtureDataset: a mixture of datasets for a given list of embodiment tags +3. CachedLeRobotSingleDataset: a single dataset for a given embodiment tag, + with caching for the video frames + +See `scripts/load_dataset.py` for examples on how to use these datasets. +""" +import os +import hashlib +import json, torch +from collections import defaultdict +from pathlib import Path +from typing import Sequence +import os, random +import numpy as np +import pandas as pd +from pydantic import BaseModel, Field, ValidationError +from torch.utils.data import Dataset +from tqdm import tqdm +from PIL import Image + +from starVLA.dataloader.gr00t_lerobot.video import get_all_frames, get_frames_by_timestamps + +from starVLA.dataloader.gr00t_lerobot.embodiment_tags import EmbodimentTag, DATASET_NAME_TO_ID +from starVLA.dataloader.gr00t_lerobot.schema import ( + DatasetMetadata, + DatasetStatisticalValues, + LeRobotModalityMetadata, + LeRobotStateActionMetadata, +) +from starVLA.dataloader.gr00t_lerobot.transform import ComposedModalityTransform + +from functools import partial +from typing import Tuple, List +import pickle + +# LeRobot v2.0 dataset file names +LE_ROBOT_MODALITY_FILENAME = "meta/modality.json" +LE_ROBOT_EPISODE_FILENAME = "meta/episodes.jsonl" +LE_ROBOT_TASKS_FILENAME = "meta/tasks.jsonl" +LE_ROBOT_INFO_FILENAME = "meta/info.json" +LE_ROBOT_STATS_FILENAME = "meta/stats_gr00t.json" +LE_ROBOT_DATA_FILENAME = "data/*/*.parquet" +LE_ROBOT_STEPS_FILENAME = "meta/steps.pkl" +EPSILON = 5e-4 + +# LeRobot v3.0 dataset file names +LE_ROBOT3_TASKS_FILENAME = "meta/tasks.parquet" +LE_ROBOT3_EPISODE_FILENAME = "meta/episodes/*/*.parquet" + + +# ============================================================================= +# Unified Representation Layout & Helpers +# ============================================================================= + +STANDARD_ACTION_DIM = 37 +# +# Unified action representation layout (0-based indices, Python slice is [start, stop)): +# Keep only: libero_franka, gr1, real_world_franka. 
+# +# - 0:7 -> left_arm (7D): xyz, rpy/euler, gripper +# Used by: gr1 left_arm +# - 7:14 -> right_arm (7D): same structure +# Used by: libero_franka; gr1 right_arm +# - 14:20 -> left_hand (6D): gr1 only +# - 20:26 -> right_hand (6D): gr1 only +# - 26:29 -> waist (3D): gr1 only +# - 29:37 -> joints + gripper (8D): real_world_franka only +# +# Mapping: +# libero_franka (7D) -> [7:14] (right_arm slot) +# gr1 (29D) -> [0:29] +# real_world_franka (8D) -> [29:37] (joints + gripper) + +ACTION_REPRESENTATION_SLICES = { + # Single-arm (7D) -> right_arm slot [7:14] + "franka": slice(7, 14), + + # Humanoid (29D) -> full [0:29] + "gr1": slice(0, 29), + + # Real-world (8D) -> [29:37] (joints + gripper) + "real_world_franka": slice(29, 37), +} + +STANDARD_STATE_DIM = 74 +# Mapping: +# libero_franka (8D) -> [0:8] +# real_world_franka (8D) -> [8:16] +# gr1 (58D after sin/cos) -> [16:74] + +STATE_REPRESENTATION_SLICES = { + # Single-arm (8D) + "franka": slice(0, 8), + # Real-world (8D) + "real_world_franka": slice(8, 16), + # GR1 isolated (58D, has StateActionSinCosTransform - different pipeline) + "gr1": slice(16, 74), +} + + +def standardize_action_representation( + action: np.ndarray, embodiment_tag: str +) -> np.ndarray: + """Map per-robot action to a fixed-size standard action vector.""" + target_slice = ACTION_REPRESENTATION_SLICES.get(embodiment_tag) + + # Only allow explicitly configured embodiment tags. + if target_slice is None: + raise ValueError( + f"Unknown embodiment tag '{embodiment_tag}' for action mapping. " + f"Known tags: {sorted(ACTION_REPRESENTATION_SLICES)}" + ) + + expected_dim = target_slice.stop - target_slice.start + if action.shape[-1] != expected_dim: + raise ValueError( + f"Action dim mismatch for tag '{embodiment_tag}': " + f"{action.shape[-1]=} vs expected {expected_dim}." + ) + + standard = np.zeros( + (*action.shape[:-1], STANDARD_ACTION_DIM), dtype=action.dtype + ) + standard[..., target_slice] = action + return standard + + +def standardize_state_representation( + state: np.ndarray, embodiment_tag: str +) -> np.ndarray: + """Map per-robot state to a fixed-size standard state vector.""" + + target_slice = STATE_REPRESENTATION_SLICES.get(embodiment_tag) + + # Only allow explicitly configured embodiment tags. + if target_slice is None: + raise ValueError( + f"Unknown embodiment tag '{embodiment_tag}' for state mapping. " + f"Known tags: {sorted(STATE_REPRESENTATION_SLICES)}" + ) + + expected_dim = target_slice.stop - target_slice.start + if state.shape[-1] != expected_dim: + raise ValueError( + f"State dim mismatch for tag '{embodiment_tag}': " + f"{state.shape[-1]=} vs expected {expected_dim}." 
+ ) + + standard = np.zeros( + (*state.shape[:-1], STANDARD_STATE_DIM), dtype=state.dtype + ) + standard[..., target_slice] = state + return standard + + +def calculate_dataset_statistics(parquet_paths: list[Path]) -> dict: + """Calculate the dataset statistics of all columns for a list of parquet files.""" + # Dataset statistics + all_low_dim_data_list = [] + # Collect all the data + # parquet_paths = parquet_paths[:3] + for parquet_path in tqdm( + sorted(list(parquet_paths)), + desc="Collecting all parquet files...", + ): + # Load the parquet file + parquet_data = pd.read_parquet(parquet_path) + parquet_data = parquet_data + all_low_dim_data_list.append(parquet_data) + + all_low_dim_data = pd.concat(all_low_dim_data_list, axis=0) + # Compute dataset statistics + dataset_statistics = {} + for le_modality in all_low_dim_data.columns: + if le_modality.startswith("annotation."): + continue + print(f"Computing statistics for {le_modality}...") + np_data = np.vstack( + [np.asarray(x, dtype=np.float32) for x in all_low_dim_data[le_modality]] + ) + dataset_statistics[le_modality] = { + "mean": np.mean(np_data, axis=0).tolist(), + "std": np.std(np_data, axis=0).tolist(), + "min": np.min(np_data, axis=0).tolist(), + "max": np.max(np_data, axis=0).tolist(), + "q01": np.quantile(np_data, 0.01, axis=0).tolist(), + "q99": np.quantile(np_data, 0.99, axis=0).tolist(), + } + return dataset_statistics + + +class ModalityConfig(BaseModel): + """Configuration for a modality.""" + + delta_indices: list[int] + """Delta indices to sample relative to the current index. The returned data will correspond to the original data at a sampled base index + delta indices.""" + modality_keys: list[str] + """The keys to load for the modality in the dataset.""" + + +class LeRobotSingleDataset(Dataset): + """ + Base dataset class for LeRobot that supports sharding. + """ + def __init__( + self, + dataset_path: Path | str, + modality_configs: dict[str, ModalityConfig], + embodiment_tag: str | EmbodimentTag, + video_backend: str = "decord", + video_backend_kwargs: dict | None = None, + transforms: ComposedModalityTransform | None = None, + delete_pause_frame: bool = False, + **kwargs, + ): + """ + Initialize the dataset. + + Args: + dataset_path (Path | str): The path to the dataset. + modality_configs (dict[str, ModalityConfig]): The configuration for each modality. The keys are the modality names, and the values are the modality configurations. + See `ModalityConfig` for more details. + video_backend (str): Backend for video reading. + video_backend_kwargs (dict): Keyword arguments for the video backend when initializing the video reader. + transforms (ComposedModalityTransform): The transforms to apply to the dataset. + embodiment_tag (EmbodimentTag): Overload the embodiment tag for the dataset. e.g. define it as "new_embodiment" + """ + # first check if the path directory exists + if not Path(dataset_path).exists(): + raise FileNotFoundError(f"Dataset path {dataset_path} does not exist") + data_cfg = kwargs.get("data_cfg", {}) or {} + # indict letobot version + self._lerobot_version = data_cfg.get("lerobot_version", "v2.0") #self._indict_lerobot_version(**kwargs) + self.load_video = data_cfg.get("load_video", True) + + self.delete_pause_frame = delete_pause_frame + + # If video loading is disabled, skip video modality end-to-end. 
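+        # (Dropping the "video" entry from modality_configs means no video keys are requested
+        # later, so no frames are decoded when samples are fetched.)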
+ if self.load_video: + self.modality_configs = modality_configs + else: + self.modality_configs = { + modality: config + for modality, config in modality_configs.items() + if modality != "video" + } + self.video_backend = video_backend + self.video_backend_kwargs = video_backend_kwargs if video_backend_kwargs is not None else {} + self.transforms = ( + transforms if transforms is not None else ComposedModalityTransform(transforms=[]) + ) + + self._dataset_path = Path(dataset_path) + self._dataset_name = self._dataset_path.name + self._dataset_id = DATASET_NAME_TO_ID.get(self._dataset_name) + if isinstance(embodiment_tag, EmbodimentTag): + self.tag = embodiment_tag.value + else: + self.tag = embodiment_tag + + self._metadata = self._get_metadata(EmbodimentTag(self.tag)) + + # LeRobot-specific config + self._lerobot_modality_meta = self._get_lerobot_modality_meta() + self._lerobot_info_meta = self._get_lerobot_info_meta() + self._data_path_pattern = self._get_data_path_pattern() + self._video_path_pattern = self._get_video_path_pattern() + self._chunk_size = self._get_chunk_size() + self._tasks = self._get_tasks() + self.curr_traj_data = None + self.curr_traj_id = None + + self._trajectory_ids, self._trajectory_lengths = self._get_trajectories() + self._modality_keys = self._get_modality_keys() + self._delta_indices = self._get_delta_indices() + self._all_steps = self._get_all_steps() + self.set_transforms_metadata(self.metadata) + self.set_epoch(0) + + print(f"Initialized dataset {self.dataset_name} with {embodiment_tag}") + + + # Check if the dataset is valid + self._check_integrity() + + @property + def dataset_path(self) -> Path: + """The path to the dataset that contains the METADATA_FILENAME file.""" + return self._dataset_path + + @property + def metadata(self) -> DatasetMetadata: + """The metadata for the dataset, loaded from metadata.json in the dataset directory""" + return self._metadata + + @property + def trajectory_ids(self) -> np.ndarray: + """The trajectory IDs in the dataset, stored as a 1D numpy array of strings.""" + return self._trajectory_ids + + @property + def trajectory_lengths(self) -> np.ndarray: + """The trajectory lengths in the dataset, stored as a 1D numpy array of integers. + The order of the lengths is the same as the order of the trajectory IDs. + """ + return self._trajectory_lengths + + @property + def all_steps(self) -> list[tuple[int, int]]: + """The trajectory IDs and base indices for all steps in the dataset. + Example: + self.trajectory_ids: [0, 1, 2] + self.trajectory_lengths: [3, 2, 4] + return: [ + ("traj_0", 0), ("traj_0", 1), ("traj_0", 2), + ("traj_1", 0), ("traj_1", 1), + ("traj_2", 0), ("traj_2", 1), ("traj_2", 2), ("traj_2", 3) + ] + """ + return self._all_steps + + @property + def modality_keys(self) -> dict: + """The modality keys for the dataset. The keys are the modality names, and the values are the keys for each modality. + + Example: { + "video": ["video.image_side_0", "video.image_side_1"], + "state": ["state.eef_position", "state.eef_rotation"], + "action": ["action.eef_position", "action.eef_rotation"], + "language": ["language.human.task"], + "timestamp": ["timestamp"], + "reward": ["reward"], + } + """ + return self._modality_keys + + @property + def delta_indices(self) -> dict[str, np.ndarray]: + """The delta indices for the dataset. 
The keys are the modality.key, and the values are the delta indices for each modality.key.""" + return self._delta_indices + + @property + def dataset_name(self) -> str: + """The name of the dataset.""" + return self._dataset_name + + @property + def lerobot_modality_meta(self) -> LeRobotModalityMetadata: + """The metadata for the LeRobot dataset.""" + return self._lerobot_modality_meta + + @property + def lerobot_info_meta(self) -> dict: + """The metadata for the LeRobot dataset.""" + return self._lerobot_info_meta + + @property + def data_path_pattern(self) -> str: + """The path pattern for the LeRobot dataset.""" + return self._data_path_pattern + + @property + def video_path_pattern(self) -> str: + """The path pattern for the LeRobot dataset.""" + return self._video_path_pattern + + @property + def chunk_size(self) -> int: + """The chunk size for the LeRobot dataset.""" + return self._chunk_size + + @property + def tasks(self) -> pd.DataFrame: + """The tasks for the dataset.""" + return self._tasks + + def _get_metadata(self, embodiment_tag: EmbodimentTag) -> DatasetMetadata: + """Get the metadata for the dataset. + + Returns: + dict: The metadata for the dataset. + """ + + # 1. Modality metadata + modality_meta_path = self.dataset_path / LE_ROBOT_MODALITY_FILENAME + assert ( + modality_meta_path.exists() + ), f"Please provide a {LE_ROBOT_MODALITY_FILENAME} file in {self.dataset_path}" + # 1.1. State and action modalities + simplified_modality_meta: dict[str, dict] = {} + with open(modality_meta_path, "r") as f: + le_modality_meta = LeRobotModalityMetadata.model_validate(json.load(f)) + for modality in ["state", "action"]: + simplified_modality_meta[modality] = {} + le_state_action_meta: dict[str, LeRobotStateActionMetadata] = getattr( + le_modality_meta, modality + ) + for subkey in le_state_action_meta: + state_action_dtype = np.dtype(le_state_action_meta[subkey].dtype) + if np.issubdtype(state_action_dtype, np.floating): + continuous = True + else: + continuous = False + simplified_modality_meta[modality][subkey] = { + "absolute": le_state_action_meta[subkey].absolute, + "rotation_type": le_state_action_meta[subkey].rotation_type, + "shape": [ + le_state_action_meta[subkey].end - le_state_action_meta[subkey].start + ], + "continuous": continuous, + } + + # 1.2. Video modalities + le_info_path = self.dataset_path / LE_ROBOT_INFO_FILENAME + assert ( + le_info_path.exists() + ), f"Please provide a {LE_ROBOT_INFO_FILENAME} file in {self.dataset_path}" + with open(le_info_path, "r") as f: + le_info = json.load(f) + simplified_modality_meta["video"] = {} + for new_key in le_modality_meta.video: + original_key = le_modality_meta.video[new_key].original_key + if original_key is None: + original_key = new_key + le_video_meta = le_info["features"][original_key] + height = le_video_meta["shape"][le_video_meta["names"].index("height")] + width = le_video_meta["shape"][le_video_meta["names"].index("width")] + # NOTE(FH): different lerobot dataset versions have different keys for the number of channels and fps + try: + channels = le_video_meta["shape"][le_video_meta["names"].index("channel")] + fps = le_video_meta["video_info"]["video.fps"] + except (ValueError, KeyError): + # channels = le_video_meta["shape"][le_video_meta["names"].index("channels")] + channels = le_video_meta["info"]["video.channels"] + fps = le_video_meta["info"]["video.fps"] + simplified_modality_meta["video"][new_key] = { + "resolution": [width, height], + "channels": channels, + "fps": fps, + } + + # 2. 
Dataset statistics + stats_path = self.dataset_path / LE_ROBOT_STATS_FILENAME + try: + with open(stats_path, "r") as f: + le_statistics = json.load(f) + for stat in le_statistics.values(): + DatasetStatisticalValues.model_validate(stat) + except (FileNotFoundError, ValidationError) as e: + print(f"Failed to load dataset statistics: {e}") + print(f"Calculating dataset statistics for {self.dataset_name}") + # Get all parquet files in the dataset paths + parquet_files = list((self.dataset_path).glob(LE_ROBOT_DATA_FILENAME)) + parquet_files_filtered = [] + # parquet_files[0].name = "episode_033675.parquet" is broken file + for pf in parquet_files: + if "episode_033675.parquet" in pf.name: + continue + parquet_files_filtered.append(pf) + + le_statistics = calculate_dataset_statistics(parquet_files_filtered) + with open(stats_path, "w") as f: + json.dump(le_statistics, f, indent=4) + dataset_statistics = {} + for our_modality in ["state", "action"]: + dataset_statistics[our_modality] = {} + for subkey in simplified_modality_meta[our_modality]: + dataset_statistics[our_modality][subkey] = {} + state_action_meta = le_modality_meta.get_key_meta(f"{our_modality}.{subkey}") + assert isinstance(state_action_meta, LeRobotStateActionMetadata) + le_modality = state_action_meta.original_key + for stat_name in le_statistics[le_modality]: + indices = np.arange( + state_action_meta.start, + state_action_meta.end, + ) + stat = np.array(le_statistics[le_modality][stat_name]) + dataset_statistics[our_modality][subkey][stat_name] = stat[indices].tolist() + + # 3. Full dataset metadata + metadata = DatasetMetadata( + statistics=dataset_statistics, # type: ignore + modalities=simplified_modality_meta, # type: ignore + embodiment_tag=embodiment_tag, + ) + + return metadata + + def _get_trajectories(self) -> tuple[np.ndarray, np.ndarray]: + """Get the trajectories in the dataset.""" + # Get trajectory lengths, IDs, and whitelist from dataset metadata + # v2.0 + if self._lerobot_version == "v2.0": + file_path = self.dataset_path / LE_ROBOT_EPISODE_FILENAME + with open(file_path, "r") as f: + episode_metadata = [json.loads(line) for line in f] + trajectory_ids = [] + trajectory_lengths = [] + for episode in episode_metadata: + trajectory_ids.append(episode["episode_index"]) + trajectory_lengths.append(episode["length"]) + return np.array(trajectory_ids), np.array(trajectory_lengths) + # v3.0 + elif self._lerobot_version == "v3.0": + file_paths = list((self.dataset_path).glob(LE_ROBOT3_EPISODE_FILENAME)) + trajectory_ids = [] + trajectory_lengths = [] + # data_chunck_index = [] + # data_file_index = [] + # vido_from_index = [] + self.trajectory_ids_to_metadata = {} + for file_path in file_paths: + episodes_data = pd.read_parquet(file_path) + for index, episode in episodes_data.iterrows(): + trajectory_ids.append(episode["episode_index"]) + trajectory_lengths.append(episode["length"]) + + # TODO auto map key? 
just map to file_path and file_from_index
+                    episode_meta = {
+                        "data/chunk_index": episode["data/chunk_index"],
+                        "data/file_index": episode["data/file_index"],
+                        "data/file_from_index": index,
+                    }
+                    if self.load_video:
+                        episode_meta["videos/observation.images.wrist/from_timestamp"] = episode[
+                            "videos/observation.images.wrist/from_timestamp"
+                        ]
+                    self.trajectory_ids_to_metadata[trajectory_ids[-1]] = episode_meta
+
+            # The save-index information should be directly readable here.
+            return np.array(trajectory_ids), np.array(trajectory_lengths)
+
+    def _get_all_steps(self) -> list[tuple[int, int]]:
+        """Get the trajectory IDs and base indices for all steps in the dataset.
+
+        Returns:
+            list[tuple[int, int]]: A list of (trajectory_id, base_index) tuples.
+        """
+        # Create a hash key based on configuration to ensure cache validity
+        config_key = self._get_steps_config_key()
+
+        # Create a unique filename based on config_key
+        # steps_filename = f"steps_{config_key}.pkl"
+        # NOTE(@fangjing): use a fixed filename so the cached step index stays static;
+        # deriving the name from the hash would make the sampled steps dynamic.
+        steps_filename = "steps_data_index.pkl"
+
+        steps_path = self.dataset_path / "meta" / steps_filename
+
+        # Try to load cached steps first
+        try:
+            if steps_path.exists():
+                with open(steps_path, "rb") as f:
+                    cached_data = pickle.load(f)
+                    return cached_data["steps"]
+
+        except (FileNotFoundError, pickle.PickleError, KeyError) as e:
+            print(f"Failed to load cached steps: {e}")
+            print("Computing steps from scratch...")
+
+        # Compute steps using a single process
+        all_steps = self._get_all_steps_single_process()
+
+        # Cache the computed steps with a unique filename
+        try:
+            cache_data = {
+                "config_key": config_key,
+                "steps": all_steps,
+                "num_trajectories": len(self.trajectory_ids),
+                "total_steps": len(all_steps),
+                "computed_timestamp": pd.Timestamp.now().isoformat(),
+                "delete_pause_frame": self.delete_pause_frame,
+            }
+
+            # Ensure the meta directory exists
+            steps_path.parent.mkdir(parents=True, exist_ok=True)
+
+            with open(steps_path, "wb") as f:
+                pickle.dump(cache_data, f, protocol=pickle.HIGHEST_PROTOCOL)
+            print(f"Cached steps saved to {steps_path}")
+        except Exception as e:
+            print(f"Failed to cache steps: {e}")
+
+        return all_steps
+
+    def _get_steps_config_key(self) -> str:
+        """Generate a configuration key for steps caching."""
+        config_dict = {
+            "delete_pause_frame": self.delete_pause_frame,
+            "dataset_name": self.dataset_name,
+        }
+        # Create a hash of the configuration
+        config_str = str(sorted(config_dict.items()))
+        return hashlib.md5(config_str.encode()).hexdigest()[:12]
+
+    def _get_all_steps_single_process(self) -> list[tuple[int, int]]:
+        """Original single-process implementation as fallback."""
+        all_steps: list[tuple[int, int]] = []
+        skipped_trajectories = 0
+        processed_trajectories = 0
+
+        # Check if language modality is configured
+        has_language_modality = 'language' in self.modality_keys and len(self.modality_keys['language']) > 0
+        # TODO why use trajectory_length here instead of the actual data length?
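+        # Iterate over every trajectory; trajectories that fail to load, or whose language
+        # instruction is empty (when a language modality is configured), are skipped, and the
+        # remaining ones are expanded into (trajectory_id, base_index) steps.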
+ for trajectory_id, trajectory_length in tqdm(zip(self.trajectory_ids, self.trajectory_lengths), total=len(self.trajectory_ids), desc="Getting All Step"): + try: + if self._lerobot_version == "v2.0": + data = self.get_trajectory_data(trajectory_id) + elif self._lerobot_version == "v3.0": + data = self.get_trajectory_data_lerobot_v3(trajectory_id) + + trajectory_skipped = False + + # Check if trajectory has valid language instruction (if language modality is configured) + if has_language_modality: + self.curr_traj_data = data # Set current trajectory data for get_language to work + + language_instruction = self.get_language(trajectory_id, self.modality_keys['language'][0], 0) + if not language_instruction or language_instruction[0] == "": + print(f"Skipping trajectory {trajectory_id} due to empty language instruction") + skipped_trajectories += 1 + trajectory_skipped = True + continue + + except Exception as e: + print(f"Skipping trajectory {trajectory_id} due to read error: {e}") + skipped_trajectories += 1 + trajectory_skipped = True + continue + + if not trajectory_skipped: + processed_trajectories += 1 + + for base_index in range(trajectory_length): + all_steps.append((trajectory_id, base_index)) + + # Print summary statistics + print(f"Single-process summary: Processed {processed_trajectories} trajectories, skipped {skipped_trajectories} empty trajectories") + print(f"Total steps: {len(all_steps)} from {len(self.trajectory_ids)} trajectories") + + return all_steps + + def _get_position_and_gripper_values(self, data: pd.DataFrame) -> tuple[list, list]: + """Get position and gripper values based on available columns in the dataset.""" + # Get action keys from modality_keys + action_keys = self.modality_keys.get('action', []) + + # Extract position data + delta_position_values = None + position_candidates = ['delta_eef_position'] + coordinate_candidates = ['x', 'y', 'z'] + + # First try combined position fields + for pos_key in position_candidates: + full_key = f"action.{pos_key}" + if full_key in action_keys: + try: + # Get the lerobot key for this modality + le_action_cfg = self.lerobot_modality_meta.action + subkey = pos_key + if subkey in le_action_cfg: + le_key = le_action_cfg[subkey].original_key or subkey + if le_key in data.columns: + data_array = np.stack(data[le_key]) + le_indices = np.arange(le_action_cfg[subkey].start, le_action_cfg[subkey].end) + filtered_data = data_array[:, le_indices] + delta_position_values = filtered_data.tolist() + break + except Exception: + continue + + # If combined fields not found, try individual x,y,z coordinates + if delta_position_values is None: + x_data, y_data, z_data = None, None, None + for coord in coordinate_candidates: + full_key = f"action.{coord}" + if full_key in action_keys: + try: + le_action_cfg = self.lerobot_modality_meta.action + if coord in le_action_cfg: + le_key = le_action_cfg[coord].original_key or coord + if le_key in data.columns: + data_array = np.stack(data[le_key]) + le_indices = np.arange(le_action_cfg[coord].start, le_action_cfg[coord].end) + coord_data = data_array[:, le_indices].flatten() + if coord == 'x': + x_data = coord_data + elif coord == 'y': + y_data = coord_data + elif coord == 'z': + z_data = coord_data + except Exception: + continue + + if x_data is not None and y_data is not None and z_data is not None: + delta_position_values = np.column_stack((x_data, y_data, z_data)).tolist() + + if delta_position_values is None: + # Fallback to the old hardcoded approach if metadata approach fails + if 
'action.delta_eef_position' in data.columns: + delta_position_values = data['action.delta_eef_position'].to_numpy().tolist() + elif all(col in data.columns for col in ['action.x', 'action.y', 'action.z']): + x_vals = data['action.x'].to_numpy() + y_vals = data['action.y'].to_numpy() + z_vals = data['action.z'].to_numpy() + delta_position_values = np.column_stack((x_vals, y_vals, z_vals)).tolist() + else: + raise ValueError(f"No suitable position columns found. Available columns: {data.columns.tolist()}") + + # Extract gripper data + gripper_values = None + gripper_candidates = ['gripper_close', 'gripper'] + + for grip_key in gripper_candidates: + full_key = f"action.{grip_key}" + if full_key in action_keys: + try: + le_action_cfg = self.lerobot_modality_meta.action + if grip_key in le_action_cfg: + le_key = le_action_cfg[grip_key].original_key or grip_key + if le_key in data.columns: + data_array = np.stack(data[le_key]) + le_indices = np.arange(le_action_cfg[grip_key].start, le_action_cfg[grip_key].end) + gripper_data = data_array[:, le_indices].flatten() + gripper_values = gripper_data.tolist() + break + except Exception: + continue + + if gripper_values is None: + # Fallback to the old hardcoded approach if metadata approach fails + if 'action.gripper_close' in data.columns: + gripper_values = data['action.gripper_close'].to_numpy().tolist() + elif 'action.gripper' in data.columns: + gripper_values = data['action.gripper'].to_numpy().tolist() + else: + raise ValueError(f"No suitable gripper columns found. Available columns: {data.columns.tolist()}") + + return delta_position_values, gripper_values + + def _get_modality_keys(self) -> dict: + """Get the modality keys for the dataset. + The keys are the modality names, and the values are the keys for each modality. + See property `modality_keys` for the expected format. 
+        """
+        modality_keys = defaultdict(list)
+        for modality, config in self.modality_configs.items():
+            modality_keys[modality] = config.modality_keys
+        return modality_keys
+
+    def _get_delta_indices(self) -> dict[str, np.ndarray]:
+        """Restructure the delta indices to use modality.key as keys instead of just the modalities."""
+        delta_indices: dict[str, np.ndarray] = {}
+        for config in self.modality_configs.values():
+            for key in config.modality_keys:
+                delta_indices[key] = np.array(config.delta_indices)
+        return delta_indices
+
+    def _get_lerobot_modality_meta(self) -> LeRobotModalityMetadata:
+        """Get the metadata for the LeRobot dataset."""
+        modality_meta_path = self.dataset_path / LE_ROBOT_MODALITY_FILENAME
+        assert (
+            modality_meta_path.exists()
+        ), f"Please provide a {LE_ROBOT_MODALITY_FILENAME} file in {self.dataset_path}"
+        with open(modality_meta_path, "r") as f:
+            modality_meta = LeRobotModalityMetadata.model_validate(json.load(f))
+        return modality_meta
+
+    def _get_lerobot_info_meta(self) -> dict:
+        """Get the metadata for the LeRobot dataset."""
+        info_meta_path = self.dataset_path / LE_ROBOT_INFO_FILENAME
+        with open(info_meta_path, "r") as f:
+            info_meta = json.load(f)
+        return info_meta
+
+    def _get_data_path_pattern(self) -> str:
+        """Get the data path pattern for the LeRobot dataset."""
+        return self.lerobot_info_meta["data_path"]
+
+    def _get_video_path_pattern(self) -> str:
+        """Get the video path pattern for the LeRobot dataset."""
+        return self.lerobot_info_meta["video_path"]
+
+    def _get_chunk_size(self) -> int:
+        """Get the chunk size for the LeRobot dataset."""
+        return self.lerobot_info_meta["chunks_size"]
+
+    def _get_tasks(self) -> pd.DataFrame:
+        """Get the tasks for the dataset."""
+        if self._lerobot_version == "v2.0":
+            tasks_path = self.dataset_path / LE_ROBOT_TASKS_FILENAME
+            with open(tasks_path, "r") as f:
+                tasks = [json.loads(line) for line in f]
+            df = pd.DataFrame(tasks)
+            return df.set_index("task_index")
+
+        elif self._lerobot_version == "v3.0":
+            tasks_path = self.dataset_path / LE_ROBOT3_TASKS_FILENAME
+            df = pd.read_parquet(tasks_path)
+            df = df.reset_index()  # turn the index into a column, usually named 'index'
+            df = df.rename(columns={'index': 'task'})  # rename the 'index' column to 'task'
+            df = df[['task_index', 'task']]  # reorder the columns
+            return df
+
+    def _check_integrity(self):
+        """Use the config to check if the keys are valid and detect silent data corruption."""
+        ERROR_MSG_HEADER = f"Error occurred in initializing dataset {self.dataset_name}:\n"
+
+        for modality_config in self.modality_configs.values():
+            for key in modality_config.modality_keys:
+                if key == "lapa_action" or key == "dream_actions":
+                    continue  # no metadata needed for lapa actions because they come pre-normalized
+                # Check if the key is valid
+                try:
+                    self.lerobot_modality_meta.get_key_meta(key)
+                except Exception as e:
+                    raise ValueError(
+                        ERROR_MSG_HEADER + f"Unable to find key {key} in modality metadata:\n{e}"
+                    )
+
+    def set_transforms_metadata(self, metadata: DatasetMetadata):
+        """Set the metadata for the transforms. This is useful for transforms that need to know the metadata, such as the normalization values."""
+        self.transforms.set_metadata(metadata)
+
+    def set_epoch(self, epoch: int):
+        """Set the epoch for the dataset.
+
+        Args:
+            epoch (int): The epoch to set.
+        """
+        self.epoch = epoch
+
+    def __len__(self) -> int:
+        """Get the total number of data points in the dataset.
+
+        Returns:
+            int: the total number of data points in the dataset.
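+                This equals len(self.all_steps), i.e. the summed length of all trajectories that were not skipped.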
+ """ + return len(self.all_steps) + + def __str__(self) -> str: + """Get the description of the dataset.""" + return f"{self.dataset_name} ({len(self)} steps)" + + + def __getitem__(self, index: int) -> dict: + """Get the data for a single step in a trajectory. + + Args: + index (int): The index of the step to get. + + Returns: + dict: The data for the step. + """ + trajectory_id, base_index = self.all_steps[index] + data = self.get_step_data(trajectory_id, base_index) + + # Process all video keys dynamically + images = [] + for video_key in self.modality_keys.get("video", []): + image = data[video_key][0] + + image = Image.fromarray(image).resize((224, 224)) + images.append(image) + + # Get language and action data + language = data[self.modality_keys["language"][0]][0] + action = [] + for action_key in self.modality_keys["action"]: + action.append(data[action_key]) + action = np.concatenate(action, axis=1) + action = standardize_action_representation(action, self.tag) + + state = [] + for state_key in self.modality_keys["state"]: + state.append(data[state_key]) + state = np.concatenate(state, axis=1) + state = standardize_state_representation(state, self.tag) + + return dict(action=action, state=state, image=images, language=language, dataset_id=self._dataset_id) + + def get_step_data(self, trajectory_id: int, base_index: int) -> dict: + """Get the RAW data for a single step in a trajectory. No transforms are applied. + + Args: + trajectory_id (int): The name of the trajectory. + base_index (int): The base step index in the trajectory. + + Returns: + dict: The RAW data for the step. + + Example return: + { + "video": { + "video.image_side_0": [B, T, H, W, C], + "video.image_side_1": [B, T, H, W, C], + }, + "state": { + "state.eef_position": [B, T, state_dim], + "state.eef_rotation": [B, T, state_dim], + }, + "action": { + "action.eef_position": [B, T, action_dim], + "action.eef_rotation": [B, T, action_dim], + }, + } + """ + data = {} + # Get the data for all modalities # just for action base data + self.curr_traj_data = self.get_trajectory_data(trajectory_id) + # TODO @JinhuiYE The logic below is poorly implemented. Data reading should be directly based on curr_traj_data. 
+ for modality in self.modality_keys: + # Get the data corresponding to each key in the modality + for key in self.modality_keys[modality]: + data[key] = self.get_data_by_modality(trajectory_id, modality, key, base_index) + return data + + def get_trajectory_data(self, trajectory_id: int) -> pd.DataFrame: + """Get the data for a trajectory.""" + if self._lerobot_version == "v2.0": + + if self.curr_traj_id == trajectory_id and self.curr_traj_data is not None: + return self.curr_traj_data + else: + chunk_index = self.get_episode_chunk(trajectory_id) + parquet_path = self.dataset_path / self.data_path_pattern.format( + episode_chunk=chunk_index, episode_index=trajectory_id + ) + assert parquet_path.exists(), f"Parquet file not found at {parquet_path}" + return pd.read_parquet(parquet_path) + elif self._lerobot_version == "v3.0": + return self.get_trajectory_data_lerobot_v3(trajectory_id) + + def get_trajectory_data_lerobot_v3(self, trajectory_id: int) -> pd.DataFrame: + """Get the data for a trajectory from lerobot v3.""" + if self.curr_traj_id == trajectory_id and self.curr_traj_data is not None: + return self.curr_traj_data + else: #TODO check detail later + chunk_index = self.get_episode_chunk(trajectory_id) + + file_index = self.get_episode_file_index(trajectory_id) + # file_from_index = self.get_episode_file_from_index(trajectory_id) + + + parquet_path = self.dataset_path / self.data_path_pattern.format( + chunk_index=chunk_index, file_index=file_index + ) + assert parquet_path.exists(), f"Parquet file not found at {parquet_path}" + file_data = pd.read_parquet(parquet_path) + + # filter by trajectory_id + episode_data = file_data.loc[file_data["episode_index"] == trajectory_id].copy() + + # fix timestamp from epis index to file index for video alignment + if self.load_video: + from_timestamp = self.trajectory_ids_to_metadata[trajectory_id].get( + "videos/observation.images.wrist/from_timestamp", 0 + ) + episode_data["timestamp"] = episode_data["timestamp"] + from_timestamp + + return episode_data + + + def get_trajectory_index(self, trajectory_id: int) -> int: + """Get the index of the trajectory in the dataset by the trajectory ID. + This is useful when you need to get the trajectory length or sampling weight corresponding to the trajectory ID. + + Args: + trajectory_id (str): The ID of the trajectory. + + Returns: + int: The index of the trajectory in the dataset. + """ + trajectory_indices = np.where(self.trajectory_ids == trajectory_id)[0] + if len(trajectory_indices) != 1: + raise ValueError( + f"Error finding trajectory index for {trajectory_id}, found {trajectory_indices=}" + ) + return trajectory_indices[0] + + def get_episode_chunk(self, ep_index: int) -> int: + """Get the chunk index for an episode index.""" + return ep_index // self.chunk_size + def get_episode_file_index(self, ep_index: int) -> int: + """Get the file index for an episode index.""" + episode_meta = self.trajectory_ids_to_metadata[ep_index] + return episode_meta["data/file_index"] + + def get_episode_file_from_index(self, ep_index: int) -> int: + """Get the file from index for an episode index.""" + episode_meta = self.trajectory_ids_to_metadata[ep_index] + return episode_meta["data/file_from_index"] + + + def retrieve_data_and_pad( + self, + array: np.ndarray, + step_indices: np.ndarray, + max_length: int, + padding_strategy: str = "first_last", + ) -> np.ndarray: + """Retrieve the data from the dataset and pad it if necessary. + Args: + array (np.ndarray): The array to retrieve the data from. 
+ step_indices (np.ndarray): The step indices to retrieve the data for. + max_length (int): The maximum length of the data. + padding_strategy (str): The padding strategy, either "first" or "last". + """ + # Get the padding indices + front_padding_indices = step_indices < 0 + end_padding_indices = step_indices >= max_length + padding_positions = np.logical_or(front_padding_indices, end_padding_indices) + # Retrieve the data with the non-padding indices + # If there exists some padding, Given T step_indices, the shape of the retrieved data will be (T', ...) where T' < T + raw_data = array[step_indices[~padding_positions]] + assert isinstance(raw_data, np.ndarray), f"{type(raw_data)=}" + # This is the shape of the output, (T, ...) + if raw_data.ndim == 1: + expected_shape = (len(step_indices),) + else: + expected_shape = (len(step_indices), *array.shape[1:]) + + # Pad the data + output = np.zeros(expected_shape) + # Assign the non-padded data + output[~padding_positions] = raw_data + # If there exists some padding, pad the data + if padding_positions.any(): + if padding_strategy == "first_last": + # Use first / last step data to pad + front_padding_data = array[0] + end_padding_data = array[-1] + output[front_padding_indices] = front_padding_data + output[end_padding_indices] = end_padding_data + elif padding_strategy == "zero": + # Use zero padding + output[padding_positions] = 0 + else: + raise ValueError(f"Invalid padding strategy: {padding_strategy}") + return output + + def get_video_path(self, trajectory_id: int, key: str) -> Path: + chunk_index = self.get_episode_chunk(trajectory_id) + original_key = self.lerobot_modality_meta.video[key].original_key + if original_key is None: + original_key = key + if self._lerobot_version == "v2.0": + video_filename = self.video_path_pattern.format( + episode_chunk=chunk_index, episode_index=trajectory_id, video_key=original_key + ) + elif self._lerobot_version == "v3.0": + episode_meta = self.trajectory_ids_to_metadata[trajectory_id] + video_filename = self.video_path_pattern.format( + video_key=original_key, + chunk_index=episode_meta["data/chunk_index"], + file_index=episode_meta["data/file_index"], + ) + return self.dataset_path / video_filename + + def get_video( + self, + trajectory_id: int, + key: str, + base_index: int, + ) -> np.ndarray: + """Get the video frames for a trajectory by a base index. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (str): The ID of the trajectory. + key (str): The key of the video. + base_index (int): The base index of the trajectory. + + Returns: + np.ndarray: The video frames for the trajectory and frame indices. 
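A self-contained sketch of the padding behaviour implemented by retrieve_data_and_pad above: out-of-range step indices are filled with the first/last row (for absolute data) or with zeros (for relative data). The array and indices here are toy values.

import numpy as np

array = np.array([[0.0], [1.0], [2.0], [3.0]])    # (T=4, D=1) trajectory data
step_indices = np.array([-2, -1, 0, 1, 2, 3, 4])  # window reaching past both ends
max_length = len(array)

front = step_indices < 0
end = step_indices >= max_length
padding = front | end

out = np.zeros((len(step_indices), array.shape[1]))
out[~padding] = array[step_indices[~padding]]

# "first_last" strategy (used for absolute data): repeat the boundary rows.
out[front] = array[0]
out[end] = array[-1]
print(out.ravel())  # [0. 0. 0. 1. 2. 3. 3.]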
Shape: (T, H, W, C) + """ + # Get the step indices + step_indices = self.delta_indices[key] + base_index + # print(f"{step_indices=}") + # Get the trajectory index + trajectory_index = self.get_trajectory_index(trajectory_id) + # Ensure the indices are within the valid range + # This is equivalent to padding the video with extra frames at the beginning and end + step_indices = np.maximum(step_indices, 0) + step_indices = np.minimum(step_indices, self.trajectory_lengths[trajectory_index] - 1) + assert key.startswith("video."), f"Video key must start with 'video.', got {key}" + # Get the sub-key + key = key.replace("video.", "") + video_path = self.get_video_path(trajectory_id, key) + # Get the action/state timestamps for each frame in the video + assert self.curr_traj_data is not None, f"No data found for {trajectory_id=}" + assert "timestamp" in self.curr_traj_data.columns, f"No timestamp found in {trajectory_id=}" + timestamp: np.ndarray = self.curr_traj_data["timestamp"].to_numpy() + # Get the corresponding video timestamps from the step indices + video_timestamp = timestamp[step_indices] + + return get_frames_by_timestamps( + video_path.as_posix(), + video_timestamp, + video_backend=self.video_backend, # TODO + video_backend_kwargs=self.video_backend_kwargs, + ) + + def get_state_or_action( + self, + trajectory_id: int, + modality: str, + key: str, + base_index: int, + ) -> np.ndarray: + """Get the state or action data for a trajectory by a base index. + If the step indices are out of range, pad with the data: + if the data is stored in absolute format, pad with the first or last step data; + otherwise, pad with zero. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (int): The ID of the trajectory. + modality (str): The modality of the data. + key (str): The key of the data. + base_index (int): The base index of the trajectory. + + Returns: + np.ndarray: The data for the trajectory and step indices. + """ + # Get the step indices + step_indices = self.delta_indices[key] + base_index + # Get the trajectory index + trajectory_index = self.get_trajectory_index(trajectory_id) + # Get the maximum length of the trajectory + max_length = self.trajectory_lengths[trajectory_index] + assert key.startswith(modality + "."), f"{key} must start with {modality + '.'}, got {key}" + # Get the sub-key, e.g. 
state.joint_angles -> joint_angles + key = key.replace(modality + ".", "") + # Get the lerobot key + le_state_or_action_cfg = getattr(self.lerobot_modality_meta, modality) + le_key = le_state_or_action_cfg[key].original_key + if le_key is None: + le_key = key + # Get the data array, shape: (T, D) + assert self.curr_traj_data is not None, f"No data found for {trajectory_id=}" + assert le_key in self.curr_traj_data.columns, f"No {le_key} found in {trajectory_id=}" + data_array: np.ndarray = np.stack(self.curr_traj_data[le_key]) # type: ignore + assert data_array.ndim == 2, f"Expected 2D array, got key {le_key} is{data_array.shape} array" + le_indices = np.arange( + le_state_or_action_cfg[key].start, + le_state_or_action_cfg[key].end, + ) + data_array = data_array[:, le_indices] + # Get the state or action configuration + state_or_action_cfg = getattr(self.metadata.modalities, modality)[key] + + # Pad the data + return self.retrieve_data_and_pad( + array=data_array, + step_indices=step_indices, + max_length=max_length, + padding_strategy="first_last" if state_or_action_cfg.absolute else "zero", + # padding_strategy="zero", # HACK for realdata + ) + + def get_language( + self, + trajectory_id: int, + key: str, + base_index: int, + ) -> list[str]: + """Get the language annotation data for a trajectory by step indices. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (int): The ID of the trajectory. + key (str): The key of the annotation. + base_index (int): The base index of the trajectory. + + Returns: + list[str]: The annotation data for the trajectory and step indices. If no matching data is found, return empty strings. + """ + assert self.curr_traj_data is not None, f"No data found for {trajectory_id=}" + # Get the step indices + step_indices = self.delta_indices[key] + base_index + # Get the trajectory index + trajectory_index = self.get_trajectory_index(trajectory_id) + # Get the maximum length of the trajectory + max_length = self.trajectory_lengths[trajectory_index] + # Get the end times corresponding to the closest indices + step_indices = np.maximum(step_indices, 0) + step_indices = np.minimum(step_indices, max_length - 1) + # Get the annotations + task_indices: list[int] = [] + assert key.startswith( + "annotation." + ), f"Language key must start with 'annotation.', got {key}" + subkey = key.replace("annotation.", "") + annotation_meta = self.lerobot_modality_meta.annotation + assert annotation_meta is not None, f"Annotation metadata is None for {subkey}" + assert ( + subkey in annotation_meta + ), f"Annotation key {subkey} not found in metadata, available annotation keys: {annotation_meta.keys()}" + subkey_meta = annotation_meta[subkey] + original_key = subkey_meta.original_key + if original_key is None: + original_key = key + for i in range(len(step_indices)): # + # task_indices.append(self.curr_traj_data[original_key][step_indices[i]].item()) + value = self.curr_traj_data[original_key].iloc[step_indices[i]] # TODO check v2.0 + task_indices.append(value if isinstance(value, (int, float)) else value.item()) + + return self.tasks.loc[task_indices]["task"].tolist() + + def get_data_by_modality( + self, + trajectory_id: int, + modality: str, + key: str, + base_index: int, + ): + """Get the data corresponding to the modality for a trajectory by a base index. + This method will call the corresponding helper method based on the modality. + See the helper methods for more details. 
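A small sketch of the column slicing done in get_state_or_action above: the LeRobot modality metadata maps a sub-key to a flat parquet column via original_key, and start/end pick out its slice. The column layout and index values here are hypothetical.

import numpy as np

# Toy stand-in for one parquet column holding a concatenated state vector per step:
# here 7 joint angles followed by 1 gripper value, over T=3 steps.
curr_traj_column = [np.arange(8, dtype=np.float32) + 10 * t for t in range(3)]
data_array = np.stack(curr_traj_column)  # (T, D) = (3, 8)

# Hypothetical metadata for "state.gripper": start/end select its slice of the column.
start, end = 7, 8
le_indices = np.arange(start, end)

gripper = data_array[:, le_indices]      # (T, 1), as in get_state_or_action()
print(gripper.ravel())                   # [ 7. 17. 27.]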
+ NOTE: For the language modality, the data is padded with empty strings if no matching data is found. + + Args: + dataset (BaseSingleDataset): The dataset to retrieve the data from. + trajectory_id (int): The ID of the trajectory. + modality (str): The modality of the data. + key (str): The key of the data. + base_index (int): The base index of the trajectory. + """ + if modality == "video": + return self.get_video(trajectory_id, key, base_index) + elif modality == "state" or modality == "action": + return self.get_state_or_action(trajectory_id, modality, key, base_index) + elif modality == "language": + return self.get_language(trajectory_id, key, base_index) + else: + raise ValueError(f"Invalid modality: {modality}") + + def _save_dataset_statistics_(self, save_path: Path | str, format: str = "json") -> None: + """ + Save dataset statistics to specified path in the required format. + Only includes statistics for keys that are actually used in the dataset. + Key order follows modality config order. + + Args: + save_path (Path | str): Path to save the statistics file + format (str): Save format, currently only supports "json" + """ + save_path = Path(save_path) + save_path.parent.mkdir(parents=True, exist_ok=True) + + # Build the data structure to save + statistics_data = {} + + # Get used modality keys + used_action_keys, used_state_keys = get_used_modality_keys(self.modality_keys) + + # Organize statistics by tag + tag = self.tag + tag_stats = {} + + # Process action statistics (only for used keys, config order) + if hasattr(self.metadata.statistics, 'action') and self.metadata.statistics.action: + action_stats = self.metadata.statistics.action + filtered_action_stats = { + key: action_stats[key] + for key in used_action_keys + if key in action_stats + } + + if filtered_action_stats: + # Combine statistics from filtered action sub-keys + combined_action_stats = combine_modality_stats(filtered_action_stats) + + # Add mask field based on whether it's gripper or not + mask = generate_action_mask_for_used_keys( + self.metadata.modalities.action, filtered_action_stats.keys() + ) + combined_action_stats["mask"] = mask + + tag_stats["action"] = combined_action_stats + + # Process state statistics (only for used keys, config order) + if hasattr(self.metadata.statistics, 'state') and self.metadata.statistics.state: + state_stats = self.metadata.statistics.state + filtered_state_stats = { + key: state_stats[key] + for key in used_state_keys + if key in state_stats + } + + if filtered_state_stats: + combined_state_stats = combine_modality_stats(filtered_state_stats) + tag_stats["state"] = combined_state_stats + + # Add dataset counts + tag_stats["num_transitions"] = len(self) + tag_stats["num_trajectories"] = len(self.trajectory_ids) + + statistics_data[tag] = tag_stats + + # Save as JSON file + if format.lower() == "json": + if not str(save_path).endswith('.json'): + save_path = save_path.with_suffix('.json') + with open(save_path, 'w', encoding='utf-8') as f: + json.dump(statistics_data, f, indent=2, ensure_ascii=False) + else: + raise ValueError(f"Unsupported format: {format}. 
Currently only 'json' is supported.") + + print(f"Single dataset statistics saved to: {save_path}") + print(f"Used action keys (reordered): {list(used_action_keys)}") + print(f"Used state keys (reordered): {list(used_state_keys)}") + + + +class MixtureSpecElement(BaseModel): + dataset_path: list[Path] | Path = Field(..., description="The path to the dataset.") + dataset_weight: float = Field(..., description="The weight of the dataset in the mixture.") + distribute_weights: bool = Field( + default=False, + description="Whether to distribute the weights of the dataset across all the paths. If True, the weights will be evenly distributed across all the paths.", + ) + + +# Helper functions for dataset statistics + +def combine_modality_stats(modality_stats: dict) -> dict: + """ + Combine statistics from all sub-keys under a modality. + + Args: + modality_stats (dict): Statistics for a modality, containing multiple sub-keys. + Each sub-key contains DatasetStatisticalValues object. + + Returns: + dict: Combined statistics + """ + combined_stats = { + "mean": [], + "std": [], + "max": [], + "min": [], + "q01": [], + "q99": [] + } + + # Combine statistics in sub-key order + for subkey in modality_stats.keys(): + subkey_stats = modality_stats[subkey] # This is a DatasetStatisticalValues object + + # Convert DatasetStatisticalValues to dict-like access + for stat_name in ["mean", "std", "max", "min", "q01", "q99"]: + stat_value = getattr(subkey_stats, stat_name) + if isinstance(stat_value, (list, tuple)): + combined_stats[stat_name].extend(stat_value) + else: + # Handle NDArray case - convert to list + if hasattr(stat_value, 'tolist'): + combined_stats[stat_name].extend(stat_value.tolist()) + else: + combined_stats[stat_name].append(float(stat_value)) + + return combined_stats + +def generate_action_mask_for_used_keys(action_modalities: dict, used_action_keys_ordered) -> list[bool]: + """ + Generate mask based on action modalities, but only for used keys. + All dimensions are set to True so every channel is de/normalized. + + Args: + action_modalities (dict): Configuration information for action modalities. + used_action_keys_ordered: Iterable of actually used action keys in the correct order. + + Returns: + list[bool]: List of mask values + """ + mask = [] + + # Generate mask in the same order as the statistics were combined + for subkey in used_action_keys_ordered: + if subkey in action_modalities: + subkey_config = action_modalities[subkey] + + # Get dimension count from shape + if hasattr(subkey_config, 'shape') and len(subkey_config.shape) > 0: + dim_count = subkey_config.shape[0] + else: + dim_count = 1 + + # Check if it's gripper-related + is_gripper = "gripper" in subkey.lower() + + # Generate mask value for each dimension + for _ in range(dim_count): + mask.append(not is_gripper) # gripper is False, others are True + + return mask + +def get_used_modality_keys(modality_keys: dict) -> tuple[set, set]: + """Extract used action and state keys from modality configuration.""" + used_action_keys = [] + used_state_keys = [] + + # Extract action keys (remove "action." prefix) + for action_key in modality_keys.get("action", []): + if action_key.startswith("action."): + clean_key = action_key.replace("action.", "") + used_action_keys.append(clean_key) + + # Extract state keys (remove "state." 
prefix) + for state_key in modality_keys.get("state", []): + if state_key.startswith("state."): + clean_key = state_key.replace("state.", "") + used_state_keys.append(clean_key) + + return used_action_keys, used_state_keys + + +def safe_hash(input_tuple): + # keep 128 bits of the hash + tuple_string = repr(input_tuple).encode("utf-8") + sha256 = hashlib.sha256() + sha256.update(tuple_string) + + seed = int(sha256.hexdigest(), 16) + + return seed & 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF + + +class LeRobotMixtureDataset(Dataset): + """ + A mixture of multiple datasets. This class samples a single dataset based on the dataset weights and then calls the `__getitem__` method of the sampled dataset. + It is recommended to modify the single dataset class instead of this class. + """ + + def __init__( + self, + data_mixture: Sequence[tuple[LeRobotSingleDataset, float]], + mode: str, + balance_dataset_weights: bool = True, + balance_trajectory_weights: bool = True, + seed: int = 42, + metadata_config: dict = { + "percentile_mixing_method": "min_max", + }, + **kwargs, + ): + """ + Initialize the mixture dataset. + + Args: + data_mixture (list[tuple[LeRobotSingleDataset, float]]): Datasets and their corresponding weights. + mode (str): If "train", __getitem__ will return different samples every epoch; if "val" or "test", __getitem__ will return the same sample every epoch. + balance_dataset_weights (bool): If True, the weight of dataset will be multiplied by the total trajectory length of each dataset. + balance_trajectory_weights (bool): If True, sample trajectories within a dataset weighted by their length; otherwise, use equal weighting. + seed (int): Random seed for sampling. + """ + datasets: list[LeRobotSingleDataset] = [] + dataset_sampling_weights: list[float] = [] + for dataset, weight in data_mixture: + # Check if dataset is valid and has data + if len(dataset) == 0: + print(f"Warning: Skipping empty dataset {dataset.dataset_name}") + continue + datasets.append(dataset) + dataset_sampling_weights.append(weight) + + if len(datasets) == 0: + raise ValueError("No valid datasets found in the mixture. All datasets are empty.") + + self.datasets = datasets + self.balance_dataset_weights = balance_dataset_weights + self.balance_trajectory_weights = balance_trajectory_weights + self.seed = seed + self.mode = mode + + # Set properties for sampling + + # 1. Dataset lengths + self._dataset_lengths = np.array([len(dataset) for dataset in self.datasets]) + print(f"Dataset lengths: {self._dataset_lengths}") + + # 2. Dataset sampling weights + self._dataset_sampling_weights = np.array(dataset_sampling_weights) + + if self.balance_dataset_weights: + self._dataset_sampling_weights *= self._dataset_lengths + + # Check for zero or negative weights before normalization + if np.any(self._dataset_sampling_weights <= 0): + print(f"Warning: Found zero or negative sampling weights: {self._dataset_sampling_weights}") + # Set minimum weight to prevent division issues + self._dataset_sampling_weights = np.maximum(self._dataset_sampling_weights, 1e-8) + + # Normalize weights + weights_sum = self._dataset_sampling_weights.sum() + if weights_sum == 0 or np.isnan(weights_sum): + print(f"Error: Invalid weights sum: {weights_sum}") + # Fallback to equal weights + self._dataset_sampling_weights = np.ones(len(self.datasets)) / len(self.datasets) + print(f"Fallback to equal weights") + else: + self._dataset_sampling_weights /= weights_sum + + # 3. 
Trajectory sampling weights + self._trajectory_sampling_weights: list[np.ndarray] = [] + for i, dataset in enumerate(self.datasets): + trajectory_sampling_weights = np.ones(len(dataset.trajectory_lengths)) + if self.balance_trajectory_weights: + trajectory_sampling_weights *= dataset.trajectory_lengths + + # Check for zero or negative weights before normalization + if np.any(trajectory_sampling_weights <= 0): + print(f"Warning: Dataset {i} has zero or negative trajectory weights") + trajectory_sampling_weights = np.maximum(trajectory_sampling_weights, 1e-8) + + # Normalize weights + weights_sum = trajectory_sampling_weights.sum() + if weights_sum == 0 or np.isnan(weights_sum): + print(f"Error: Dataset {i} has invalid trajectory weights sum: {weights_sum}") + # Fallback to equal weights + trajectory_sampling_weights = np.ones(len(dataset.trajectory_lengths)) / len(dataset.trajectory_lengths) + else: + trajectory_sampling_weights /= weights_sum + + self._trajectory_sampling_weights.append(trajectory_sampling_weights) + + # 4. Primary dataset indices + self._primary_dataset_indices = np.array(dataset_sampling_weights) == 1.0 + if not np.any(self._primary_dataset_indices): + print(f"Warning: No dataset with weight 1.0 found. Original weights: {dataset_sampling_weights}") + # Fallback: use the dataset(s) with maximum weight as primary + max_weight = max(dataset_sampling_weights) + self._primary_dataset_indices = np.array(dataset_sampling_weights) == max_weight + print(f"Using datasets with maximum weight {max_weight} as primary: {self._primary_dataset_indices}") + + if not np.any(self._primary_dataset_indices): + # This should never happen, but just in case + print("Error: Still no primary dataset found. Using first dataset as primary.") + self._primary_dataset_indices = np.zeros(len(self.datasets), dtype=bool) + self._primary_dataset_indices[0] = True + + # Set the epoch and sample the first epoch + self.set_epoch(0) + + self.update_metadata(metadata_config) + + @property + def dataset_lengths(self) -> np.ndarray: + """The lengths of each dataset.""" + return self._dataset_lengths + + @property + def dataset_sampling_weights(self) -> np.ndarray: + """The sampling weights for each dataset.""" + return self._dataset_sampling_weights + + @property + def trajectory_sampling_weights(self) -> list[np.ndarray]: + """The sampling weights for each trajectory in each dataset.""" + return self._trajectory_sampling_weights + + @property + def primary_dataset_indices(self) -> np.ndarray: + """The indices of the primary datasets.""" + return self._primary_dataset_indices + + def __str__(self) -> str: + dataset_descriptions = [] + for dataset, weight in zip(self.datasets, self.dataset_sampling_weights): + dataset_description = { + "Dataset": str(dataset), + "Sampling weight": float(weight), + } + dataset_descriptions.append(dataset_description) + return json.dumps({"Mixture dataset": dataset_descriptions}, indent=2) + + def set_epoch(self, epoch: int): + """Set the epoch for the dataset. + + Args: + epoch (int): The epoch to set. 
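A tiny numeric sketch of the dataset-weight balancing done in the constructor above: with balance_dataset_weights enabled, each user weight is scaled by the dataset length before normalization, so sampling probability is proportional to weight times size. The weights and lengths below are made up.

import numpy as np

user_weights = np.array([1.0, 1.0, 0.5])
dataset_lengths = np.array([1000, 4000, 2000])

# balance_dataset_weights=True: scale by dataset size, then normalize to a distribution.
w = user_weights * dataset_lengths
w = w / w.sum()
print(w)  # [0.16666667 0.66666667 0.16666667]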
+ """ + self.epoch = epoch + # self.sampled_steps = self.sample_epoch() + + def sample_step(self, index: int) -> tuple[LeRobotSingleDataset, int, int]: + """Sample a single step from the dataset.""" + # return self.sampled_steps[index] + + # Set seed + seed = index if self.mode != "train" else safe_hash((self.epoch, index, self.seed)) + rng = np.random.default_rng(seed) + + # Sample dataset + dataset_index = rng.choice(len(self.datasets), p=self.dataset_sampling_weights) + dataset = self.datasets[dataset_index] + + # Sample trajectory + # trajectory_index = rng.choice( + # len(dataset.trajectory_ids), p=self.trajectory_sampling_weights[dataset_index] + # ) + # trajectory_id = dataset.trajectory_ids[trajectory_index] + + # # Sample step + # base_index = rng.choice(dataset.trajectory_lengths[trajectory_index]) + # return dataset, trajectory_id, base_index + single_step_index = rng.choice(len(dataset.all_steps)) + trajectory_id, base_index = dataset.all_steps[single_step_index] + return dataset, trajectory_id, base_index + + def __getitem__(self, index: int) -> dict: + """Get the data for a single trajectory and start index. + + Args: + index (int): The index of the trajectory to get. + + Returns: + dict: The data for the trajectory and start index. + """ + max_retries = 10 + last_exception = None + + for attempt in range(max_retries): + try: + dataset, trajectory_name, step = self.sample_step(index) + data_raw = dataset.get_step_data(trajectory_name, step) + data = dataset.transforms(data_raw) + + # Process all video keys dynamically + images = [] + for video_key in dataset.modality_keys.get("video", []): + image = data[video_key][0] + + image = Image.fromarray(image).resize((224, 224)) #TODO check if this is ok + images.append(image) + + # Get language and action data + language = data[dataset.modality_keys["language"][0]][0] + action = [] + for action_key in dataset.modality_keys["action"]: + action.append(data[action_key]) + action = np.concatenate(action, axis=1).astype(np.float16) + action = standardize_action_representation(action, dataset.tag) + + state = [] + for state_key in dataset.modality_keys["state"]: + state.append(data[state_key]) + state = np.concatenate(state, axis=1).astype(np.float16) + state = standardize_state_representation(state, dataset.tag) + + return dict(action=action, state=state, image=images, lang=language, dataset_id=dataset._dataset_id) + + except Exception as e: + last_exception = e + if attempt < max_retries - 1: + # Log the error but continue trying + print(f"Attempt {attempt + 1}/{max_retries} failed for index {index}: {e}") + print(f"Retrying with new sample...") + # For retry, we can use a slightly different index to get a new sample + # This helps avoid getting stuck on the same problematic sample + index = random.randint(0, len(self) - 1) + else: + # All retries exhausted + print(f"All {max_retries} attempts failed for index {index}") + print(f"Last error: {last_exception}") + # Return a dummy sample or re-raise the exception + raise last_exception + + def __len__(self) -> int: + """Get the length of a single epoch in the mixture. + + Returns: + int: The length of a single epoch in the mixture. 
+ """ + # Check for potential issues + if len(self.datasets) == 0: + return 0 + + # Check if any dataset lengths are 0 or NaN + if np.any(self.dataset_lengths == 0) or np.any(np.isnan(self.dataset_lengths)): + print(f"Warning: Found zero or NaN dataset lengths: {self.dataset_lengths}") + # Filter out zero/NaN length datasets + valid_indices = (self.dataset_lengths > 0) & (~np.isnan(self.dataset_lengths)) + if not np.any(valid_indices): + print("Error: All datasets have zero or NaN length") + return 0 + else: + valid_indices = np.ones(len(self.datasets), dtype=bool) + + # Check if any sampling weights are 0 or NaN + if np.any(self.dataset_sampling_weights == 0) or np.any(np.isnan(self.dataset_sampling_weights)): + print(f"Warning: Found zero or NaN sampling weights: {self.dataset_sampling_weights}") + # Use only valid weights + valid_weights = (self.dataset_sampling_weights > 0) & (~np.isnan(self.dataset_sampling_weights)) + valid_indices = valid_indices & valid_weights + if not np.any(valid_indices): + print("Error: All sampling weights are zero or NaN") + return 0 + + # Check primary dataset indices + primary_and_valid = self.primary_dataset_indices & valid_indices + if not np.any(primary_and_valid): + print(f"Warning: No valid primary datasets found. Primary indices: {self.primary_dataset_indices}, Valid indices: {valid_indices}") + # Fallback: use the largest valid dataset + if np.any(valid_indices): + max_length = self.dataset_lengths[valid_indices].max() + print(f"Fallback: Using maximum dataset length: {max_length}") + return int(max_length) + else: + return 0 + + # Calculate the ratio and get max + ratios = (self.dataset_lengths / self.dataset_sampling_weights)[primary_and_valid] + + # Check for NaN or inf in ratios + if np.any(np.isnan(ratios)) or np.any(np.isinf(ratios)): + print(f"Warning: Found NaN or inf in ratios: {ratios}") + print(f"Dataset lengths: {self.dataset_lengths[primary_and_valid]}") + print(f"Sampling weights: {self.dataset_sampling_weights[primary_and_valid]}") + # Filter out invalid ratios + valid_ratios = ratios[~np.isnan(ratios) & ~np.isinf(ratios)] + if len(valid_ratios) == 0: + print("Error: All ratios are NaN or inf") + return 0 + max_ratio = valid_ratios.max() + else: + max_ratio = ratios.max() + + result = int(max_ratio) + if result == 0: + print(f"Warning: Dataset mixture length is 0") + return result + + @staticmethod + def compute_overall_statistics( + per_task_stats: list[dict[str, dict[str, list[float] | np.ndarray]]], + dataset_sampling_weights: list[float] | np.ndarray, + percentile_mixing_method: str = "weighted_average", + ) -> dict[str, dict[str, list[float]]]: + """ + Computes overall statistics from per-task statistics using dataset sample weights. + + Args: + per_task_stats: List of per-task statistics. + Example format of one element in the per-task statistics list: + { + "state.gripper": { + "min": [...], + "max": [...], + "mean": [...], + "std": [...], + "q01": [...], + "q99": [...], + }, + ... + } + dataset_sampling_weights: List of sample weights for each task. + percentile_mixing_method: The method to mix the percentiles, either "weighted_average" or "weighted_std". + + Returns: + A dict of overall statistics per modality. 
+ """ + # Normalize the sample weights to sum to 1 + dataset_sampling_weights = np.array(dataset_sampling_weights) + normalized_weights = dataset_sampling_weights / dataset_sampling_weights.sum() + + # Initialize overall statistics dict + overall_stats: dict[str, dict[str, list[float]]] = {} + + # Get the list of modality keys + modality_keys = per_task_stats[0].keys() + + for modality in modality_keys: + # Number of dimensions (assuming consistent across tasks) + num_dims = len(per_task_stats[0][modality]["mean"]) + + # Initialize accumulators for means and variances + weighted_means = np.zeros(num_dims) + weighted_squares = np.zeros(num_dims) + + # Collect min, max, q01, q99 from all tasks + min_list = [] + max_list = [] + q01_list = [] + q99_list = [] + + for task_idx, task_stats in enumerate(per_task_stats): + w_i = normalized_weights[task_idx] + stats = task_stats[modality] + means = np.array(stats["mean"]) + stds = np.array(stats["std"]) + + # Update weighted sums for mean and variance + weighted_means += w_i * means + weighted_squares += w_i * (stds**2 + means**2) + + # Collect min, max, q01, q99 + min_list.append(stats["min"]) + max_list.append(stats["max"]) + q01_list.append(stats["q01"]) + q99_list.append(stats["q99"]) + + # Compute overall mean + overall_mean = weighted_means.tolist() + + # Compute overall variance and std deviation + overall_variance = weighted_squares - weighted_means**2 + overall_std = np.sqrt(overall_variance).tolist() + + # Compute overall min and max per dimension + overall_min = np.min(np.array(min_list), axis=0).tolist() + overall_max = np.max(np.array(max_list), axis=0).tolist() + + # Compute overall q01 and q99 per dimension + # Use weighted average of per-task quantiles + q01_array = np.array(q01_list) + q99_array = np.array(q99_list) + if percentile_mixing_method == "weighted_average": + weighted_q01 = np.average(q01_array, axis=0, weights=normalized_weights).tolist() + weighted_q99 = np.average(q99_array, axis=0, weights=normalized_weights).tolist() + # std_q01 = np.std(q01_array, axis=0).tolist() + # std_q99 = np.std(q99_array, axis=0).tolist() + # print(modality) + # print(f"{std_q01=}, {std_q99=}") + # print(f"{weighted_q01=}, {weighted_q99=}") + elif percentile_mixing_method == "min_max": + weighted_q01 = np.min(q01_array, axis=0).tolist() + weighted_q99 = np.max(q99_array, axis=0).tolist() + else: + raise ValueError(f"Invalid percentile mixing method: {percentile_mixing_method}") + + # Store the overall statistics for the modality + overall_stats[modality] = { + "min": overall_min, + "max": overall_max, + "mean": overall_mean, + "std": overall_std, + "q01": weighted_q01, + "q99": weighted_q99, + } + + return overall_stats + + @staticmethod + def merge_metadata( + metadatas: list[DatasetMetadata], + dataset_sampling_weights: list[float], + percentile_mixing_method: str, + ) -> DatasetMetadata: + """Merge multiple metadata into one.""" + # Convert to dicts + metadata_dicts = [metadata.model_dump(mode="json") for metadata in metadatas] + # Create a new metadata dict + merged_metadata = {} + + # Check all metadata have the same embodiment tag + assert all( + metadata.embodiment_tag == metadatas[0].embodiment_tag for metadata in metadatas + ), "All metadata must have the same embodiment tag" + merged_metadata["embodiment_tag"] = metadatas[0].embodiment_tag + + # Merge the dataset statistics + dataset_statistics = {} + dataset_statistics["state"] = LeRobotMixtureDataset.compute_overall_statistics( + per_task_stats=[m["statistics"]["state"] for m in 
metadata_dicts], + dataset_sampling_weights=dataset_sampling_weights, + percentile_mixing_method=percentile_mixing_method, + ) + dataset_statistics["action"] = LeRobotMixtureDataset.compute_overall_statistics( + per_task_stats=[m["statistics"]["action"] for m in metadata_dicts], + dataset_sampling_weights=dataset_sampling_weights, + percentile_mixing_method=percentile_mixing_method, + ) + merged_metadata["statistics"] = dataset_statistics + + # Merge the modality configs + modality_configs = defaultdict(set) + for metadata in metadata_dicts: + for modality, configs in metadata["modalities"].items(): + modality_configs[modality].add(json.dumps(configs)) + merged_metadata["modalities"] = {} + for modality, configs in modality_configs.items(): + # Check that all modality configs correspond to the same tag matches + assert ( + len(configs) == 1 + ), f"Multiple modality configs for modality {modality}: {list(configs)}" + merged_metadata["modalities"][modality] = json.loads(configs.pop()) + + return DatasetMetadata.model_validate(merged_metadata) + + def update_metadata(self, metadata_config: dict, cached_statistics_path: Path | str | None = None) -> None: + """ + Merge multiple metadatas into one and set the transforms with the merged metadata. + + Args: + metadata_config (dict): Configuration for the metadata. + "percentile_mixing_method": The method to mix the percentiles, either "weighted_average" or "min_max". + weighted_average: Use the weighted average of the percentiles using the weight used in sampling the datasets. + min_max: Use the min of the 1st percentile and max of the 99th percentile. + """ + # If cached path is provided, try to load and apply + if cached_statistics_path is not None: + try: + cached_stats = self.load_merged_statistics(cached_statistics_path) + self.apply_cached_statistics(cached_stats) + return + except (FileNotFoundError, KeyError, ValidationError) as e: + print(f"Failed to load cached statistics: {e}") + print("Falling back to computing statistics from scratch...") + + self.tag = EmbodimentTag.NEW_EMBODIMENT.value + self.merged_metadata: dict[str, DatasetMetadata] = {} + # Group metadata by tag + all_metadatas: dict[str, list[DatasetMetadata]] = {} + for dataset in self.datasets: + if dataset.tag not in all_metadatas: + all_metadatas[dataset.tag] = [] + all_metadatas[dataset.tag].append(dataset.metadata) + for tag, metadatas in all_metadatas.items(): + self.merged_metadata[tag] = self.merge_metadata( + metadatas=metadatas, + dataset_sampling_weights=self.dataset_sampling_weights.tolist(), + percentile_mixing_method=metadata_config["percentile_mixing_method"], + ) + for dataset in self.datasets: + dataset.set_transforms_metadata(self.merged_metadata[dataset.tag]) + + def save_dataset_statistics(self, save_path: Path | str, format: str = "json") -> None: + """ + Save merged dataset statistics to specified path in the required format. + Only includes statistics for keys that are actually used in the datasets. + Key order follows each tag's modality config order. 
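A small numeric illustration of the two percentile_mixing_method options accepted by update_metadata above, using made-up per-task quantiles: "weighted_average" averages the per-task q01/q99 with the sampling weights, while "min_max" keeps the widest bounds across tasks.

import numpy as np

q01 = np.array([[-0.8], [-1.2]])  # per-task 1st percentiles
q99 = np.array([[0.7], [1.1]])    # per-task 99th percentiles
w = np.array([0.5, 0.5])

# "weighted_average"
print(np.average(q01, axis=0, weights=w), np.average(q99, axis=0, weights=w))  # [-1.] [0.9]
# "min_max"
print(np.min(q01, axis=0), np.max(q99, axis=0))                                # [-1.2] [1.1]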
+ + Args: + save_path (Path | str): Path to save the statistics file + format (str): Save format, currently only supports "json" + """ + save_path = Path(save_path) + save_path.parent.mkdir(parents=True, exist_ok=True) + + # Build the data structure to save + statistics_data = {} + + # Keep key orders per embodiment tag (from modality config order) + tag_to_used_action_keys = {} + tag_to_used_state_keys = {} + for dataset in self.datasets: + if dataset.tag in tag_to_used_action_keys: + continue + used_action_keys, used_state_keys = get_used_modality_keys(dataset.modality_keys) + tag_to_used_action_keys[dataset.tag] = used_action_keys + tag_to_used_state_keys[dataset.tag] = used_state_keys + + # Organize statistics by tag + for tag, merged_metadata in self.merged_metadata.items(): + tag_stats = {} + + # Process action statistics + if hasattr(merged_metadata.statistics, 'action') and merged_metadata.statistics.action: + action_stats = merged_metadata.statistics.action + + used_action_keys = tag_to_used_action_keys.get(tag, []) + filtered_action_stats = { + key: action_stats[key] + for key in used_action_keys + if key in action_stats + } + + if filtered_action_stats: + combined_action_stats = combine_modality_stats(filtered_action_stats) + + mask = generate_action_mask_for_used_keys( + merged_metadata.modalities.action, filtered_action_stats.keys() + ) + combined_action_stats["mask"] = mask + + tag_stats["action"] = combined_action_stats + + # Process state statistics + if hasattr(merged_metadata.statistics, 'state') and merged_metadata.statistics.state: + state_stats = merged_metadata.statistics.state + + used_state_keys = tag_to_used_state_keys.get(tag, []) + filtered_state_stats = { + key: state_stats[key] + for key in used_state_keys + if key in state_stats + } + + if filtered_state_stats: + combined_state_stats = combine_modality_stats(filtered_state_stats) + tag_stats["state"] = combined_state_stats + + # Add dataset counts + tag_stats.update(self._get_dataset_counts(tag)) + + statistics_data[tag] = tag_stats + + # Save file + if format.lower() == "json": + if not str(save_path).endswith('.json'): + save_path = save_path.with_suffix('.json') + with open(save_path, 'w', encoding='utf-8') as f: + json.dump(statistics_data, f, indent=2, ensure_ascii=False) + else: + raise ValueError(f"Unsupported format: {format}. Currently only 'json' is supported.") + + print(f"Merged dataset statistics saved to: {save_path}") + print(f"Used action keys by tag: {tag_to_used_action_keys}") + print(f"Used state keys by tag: {tag_to_used_state_keys}") + + + def _combine_modality_stats(self, modality_stats: dict) -> dict: + """Backward compatibility wrapper.""" + return combine_modality_stats(modality_stats) + + def _generate_action_mask_for_used_keys(self, action_modalities: dict, used_action_keys_ordered) -> list[bool]: + """Backward compatibility wrapper.""" + return generate_action_mask_for_used_keys(action_modalities, used_action_keys_ordered) + + def _get_dataset_counts(self, tag: str) -> dict: + """ + Get dataset count information for specified tag. 
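For orientation, a sketch of the JSON layout save_dataset_statistics writes: one entry per embodiment tag, with the used action/state sub-keys combined in config order, a per-dimension mask that excludes gripper channels from normalization, and the dataset counts. All numbers here are placeholders, not real statistics.

# Illustrative structure only; values are placeholders.
example_statistics = {
    "franka": {
        "action": {
            "mean": [0.0, 0.0, 0.0, 0.5],
            "std": [1.0, 1.0, 1.0, 0.5],
            "min": [-1.0, -1.0, -1.0, 0.0],
            "max": [1.0, 1.0, 1.0, 1.0],
            "q01": [-0.9, -0.9, -0.9, 0.0],
            "q99": [0.9, 0.9, 0.9, 1.0],
            "mask": [True, True, True, False],  # gripper dimension is not normalized
        },
        "state": {
            "mean": [0.0, 0.0, 0.0],
            "std": [1.0, 1.0, 1.0],
            "min": [-1.0, -1.0, -1.0],
            "max": [1.0, 1.0, 1.0],
            "q01": [-0.9, -0.9, -0.9],
            "q99": [0.9, 0.9, 0.9],
        },
        "num_transitions": 1000,
        "num_trajectories": 10,
    },
}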
+ + Args: + tag (str): embodiment tag + + Returns: + dict: Dictionary containing num_transitions and num_trajectories + """ + num_transitions = 0 + num_trajectories = 0 + + # Count dataset information belonging to this tag + for dataset in self.datasets: + if dataset.tag == tag: + num_transitions += len(dataset) + num_trajectories += len(dataset.trajectory_ids) + + return { + "num_transitions": num_transitions, + "num_trajectories": num_trajectories + } + + @classmethod + def load_merged_statistics(cls, load_path: Path | str) -> dict: + """ + Load merged dataset statistics from file. + + Args: + load_path (Path | str): Path to the statistics file + + Returns: + dict: Dictionary containing merged statistics + """ + load_path = Path(load_path) + if not load_path.exists(): + raise FileNotFoundError(f"Statistics file not found: {load_path}") + + if load_path.suffix.lower() == '.json': + with open(load_path, 'r', encoding='utf-8') as f: + return json.load(f) + elif load_path.suffix.lower() == '.pkl': + import pickle + with open(load_path, 'rb') as f: + return pickle.load(f) + else: + raise ValueError(f"Unsupported file format: {load_path.suffix}") + + def apply_cached_statistics(self, cached_statistics: dict) -> None: + """ + Apply cached statistics to avoid recomputation. + + Args: + cached_statistics (dict): Statistics loaded from file + """ + # Validate that cached statistics match current datasets + if "metadata" in cached_statistics: + cached_dataset_names = set(cached_statistics["metadata"]["dataset_names"]) + current_dataset_names = set(dataset.dataset_name for dataset in self.datasets) + + if cached_dataset_names != current_dataset_names: + print("Warning: Cached statistics dataset names don't match current datasets.") + print(f"Cached: {cached_dataset_names}") + print(f"Current: {current_dataset_names}") + return + + # Apply cached statistics + self.merged_metadata = {} + for tag, stats_data in cached_statistics.items(): + if tag == "metadata": # Skip metadata field + continue + + # Convert back to DatasetMetadata format + metadata_dict = { + "embodiment_tag": tag, + "statistics": { + "action": {}, + "state": {} + }, + "modalities": {} + } + + # Convert action statistics back + if "action" in stats_data: + action_data = stats_data["action"] + # This is simplified - you may need to split back to sub-keys + metadata_dict["statistics"]["action"] = action_data + + # Convert state statistics back + if "state" in stats_data: + state_data = stats_data["state"] + metadata_dict["statistics"]["state"] = state_data + + self.merged_metadata[tag] = DatasetMetadata.model_validate(metadata_dict) + + # Update transforms metadata for each dataset + for dataset in self.datasets: + if dataset.tag in self.merged_metadata: + dataset.set_transforms_metadata(self.merged_metadata[dataset.tag]) + + print(f"Applied cached statistics for {len(self.merged_metadata)} embodiment tags.") + diff --git a/code/dataloader/gr00t_lerobot/embodiment_tags.py b/code/dataloader/gr00t_lerobot/embodiment_tags.py new file mode 100644 index 0000000000000000000000000000000000000000..7e0376ba4316f0c4f6944f891750bfd58ec45e0a --- /dev/null +++ b/code/dataloader/gr00t_lerobot/embodiment_tags.py @@ -0,0 +1,198 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum + + +class EmbodimentTag(Enum): + GR1 = "gr1" + """ + The GR1 dataset. + """ + + OXE_DROID = "oxe_droid" + """ + The OxE Droid dataset. + """ + + OXE_BRIDGE = "oxe_bridge" + """ + The OxE Bridge dataset. + """ + + OXE_RT1 = "oxe_rt1" + """ + The OxE RT-1 dataset. + """ + + AGIBOT_GENIE1 = "agibot_genie1" + """ + The AgiBot Genie-1 with gripper dataset. + """ + + NEW_EMBODIMENT = "new_embodiment" + """ + Any new embodiment for finetuning. + """ + + FRANKA = 'franka' + """ + The Franka Emika Panda robot. + """ + + ROBOTWIN = "robotwin" + """ + RobotWin (dual-arm) datasets. + """ + + REAL_WORLD_FRANKA = "real_world_franka" + """ + The Real-World Franka robot. + """ + +# Embodiment tag string: to projector index in the Action Expert Module +# EMBODIMENT_TAG_MAPPING = { +# EmbodimentTag.NEW_EMBODIMENT.value: 31, +# EmbodimentTag.OXE_DROID.value: 17, +# EmbodimentTag.OXE_BRIDGE.value: 18, +# EmbodimentTag.OXE_RT1.value: 19, +# EmbodimentTag.AGIBOT_GENIE1.value: 26, +# EmbodimentTag.GR1.value: 24, +# EmbodimentTag.FRANKA.value: 25, +# EmbodimentTag.ROBOTWIN.value: 27, +# EmbodimentTag.REAL_WORLD_FRANKA.value: 28, +# } + +# Robot type to embodiment tag mapping +ROBOT_TYPE_TO_EMBODIMENT_TAG = { + "libero_franka": EmbodimentTag.FRANKA, + "oxe_droid": EmbodimentTag.OXE_DROID, + "oxe_bridge": EmbodimentTag.OXE_BRIDGE, + "oxe_rt1": EmbodimentTag.OXE_RT1, + "demo_sim_franka_delta_joints": EmbodimentTag.FRANKA, + "custom_robot_config": EmbodimentTag.NEW_EMBODIMENT, + "fourier_gr1_arms_waist": EmbodimentTag.GR1, + "robotwin": EmbodimentTag.ROBOTWIN, + "real_world_franka": EmbodimentTag.REAL_WORLD_FRANKA, + } + +DATASET_NAME_TO_ID = { + # Libero Datasets + "libero_object_no_noops_1.0.0_lerobot": 1, + "libero_goal_no_noops_1.0.0_lerobot": 1, + "libero_spatial_no_noops_1.0.0_lerobot": 1, + "libero_10_no_noops_1.0.0_lerobot": 1, + "libero_90_no_noops_lerobot": 1, + + # OXE Datasets + "bridge_orig_lerobot": 2, + "fractal20220817_data_lerobot": 3, + "droid_lerobot": 4, + "furniture_bench_dataset_lerobot": 5, + "taco_play_lerobot": 6, + + # RoboCasa Datasets + "gr1_unified.PnPBottleToCabinetClose_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PnPCanToDrawerClose_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PnPCupToDrawerClose_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PnPMilkToMicrowaveClose_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PnPPotatoToMicrowaveClose_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PnPWineToCabinetClose_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromCuttingboardToBasketSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromCuttingboardToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromCuttingboardToPanSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromCuttingboardToPotSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromCuttingboardToTieredbasketSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + 
"gr1_unified.PosttrainPnPNovelFromPlacematToBasketSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromPlacematToBowlSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromPlacematToPlateSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromPlacematToTieredshelfSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromPlateToBowlSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromPlateToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromPlateToPanSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromPlateToPlateSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromTrayToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromTrayToPlateSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromTrayToPotSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromTrayToTieredbasketSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PosttrainPnPNovelFromTrayToTieredshelfSplitA_GR1ArmsAndWaistFourierHands_1000": 7, + "gr1_unified.PnPBottleToCabinetClose_GR1ArmsAndWaistFourierHands_200": 7, + "gr1_unified.PosttrainPnPNovelFromCuttingboardToBasketSplitA_GR1ArmsAndWaistFourierHands_200": 7, + "gr1_unified.PosttrainPnPNovelFromPlacematToBasketSplitA_GR1ArmsAndWaistFourierHands_200": 7, + "gr1_unified.PosttrainPnPNovelFromPlateToBowlSplitA_GR1ArmsAndWaistFourierHands_200": 7, + "gr1_unified.PosttrainPnPNovelFromTrayToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_200": 7, + + # robotwin + "adjust_bottle": 8, + "beat_block_hammer": 8, + "blocks_ranking_rgb": 8, + "blocks_ranking_size": 8, + "click_alarmclock": 8, + "click_bell": 8, + "dump_bin_bigbin": 8, + "grab_roller": 8, + "handover_block": 8, + "handover_mic": 8, + "hanging_mug": 8, + "lift_pot": 8, + "move_can_pot": 8, + "move_pillbottle_pad": 8, + "move_playingcard_away": 8, + "move_stapler_pad": 8, + "open_laptop": 8, + "open_microwave": 8, + "pick_diverse_bottles": 8, + "pick_dual_bottles": 8, + "place_a2b_left": 8, + "place_a2b_right": 8, + "place_bread_basket": 8, + "place_bread_skillet": 8, + "place_burger_fries": 8, + "place_can_basket": 8, + "place_cans_plasticbox": 8, + "place_container_plate": 8, + "place_dual_shoes": 8, + "place_empty_cup": 8, + "place_fan": 8, + "place_mouse_pad": 8, + "place_object_basket": 8, + "place_object_scale": 8, + "place_object_stand": 8, + "place_phone_stand": 8, + "place_shoe": 8, + "press_stapler": 8, + "put_bottles_dustbin": 8, + "put_object_cabinet": 8, + "rotate_qrcode": 8, + "scan_object": 8, + "shake_bottle_horizontally": 8, + "shake_bottle": 8, + "stack_blocks_three": 8, + "stack_blocks_two": 8, + "stack_bowls_three": 8, + "stack_bowls_two": 8, + "stamp_seal": 8, + "turn_switch": 8, + + # real-world + "real_grasp_coke": 9, + "real_pick_up_cup_in_middle": 9, + "real_stack_cups": 9, + "real_put_apple_on_tray_and_then_put_banana_on_tray": 9, + "realworld_tasks_all": 9, + "realworld_4tasks": 9, + "realworld_collect": 9, + "realworld_pickplace_4tasks": 9, +} \ No newline at end of file diff --git a/code/dataloader/gr00t_lerobot/mixtures.py b/code/dataloader/gr00t_lerobot/mixtures.py new file mode 100644 index 0000000000000000000000000000000000000000..3dd282a3e65440a3db58abd6baff45de5ce730d0 --- /dev/null +++ b/code/dataloader/gr00t_lerobot/mixtures.py @@ -0,0 +1,241 @@ +""" +mixtures.py + 
+Defines a registry of dataset mixtures and weights for the Open-X Embodiment Datasets. Each dataset is associated with +a float "sampling weight" +""" + +from typing import Dict, List, Tuple + + +# Dataset mixture name mapped to a list of tuples containing: +## {nakename: [(data_name, sampling_weight, robot_type)] } +DATASET_NAMED_MIXTURES = { + + "custom_dataset": [ + ("custom_dataset_name", 1.0, "custom_robot_config"), + ], + "custom_dataset_2": [ + ("custom_dataset_name_1", 1.0, "custom_robot_config"), + ("custom_dataset_name_2", 1.0, "custom_robot_config"), + ], + + "libero_all": [ + ("libero_object_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), + ("libero_goal_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), + ("libero_spatial_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), + ("libero_10_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), + # ("libero_90_no_noops_lerobot", 1.0, "libero_franka"), + ], + "bridge": [ + ("bridge_orig_1.0.0_lerobot", 1.0, "oxe_bridge"), + ], + "bridge_rt_1": [ + ("bridge_orig_1.0.0_lerobot", 1.0, "oxe_bridge"), + ("fractal20220817_data_0.1.0_lerobot", 1.0, "oxe_rt1"), + ], + + "demo_sim_pick_place": [ + ("sim_pick_place", 1.0, "demo_sim_franka_delta_joints"), + ], + + "custom_dataset": [ + ("custom_dataset_name", 1.0, "custom_robot_config"), + ], + "custom_dataset_2": [ + ("custom_dataset_name_1", 1.0, "custom_robot_config"), + ("custom_dataset_name_2", 1.0, "custom_robot_config"), + ], + + "fourier_gr1_unified_1000": [ + ("gr1_unified.PnPBottleToCabinetClose_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PnPCanToDrawerClose_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PnPCupToDrawerClose_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PnPMilkToMicrowaveClose_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PnPPotatoToMicrowaveClose_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PnPWineToCabinetClose_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromCuttingboardToBasketSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromCuttingboardToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromCuttingboardToPanSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromCuttingboardToPotSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromCuttingboardToTieredbasketSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromPlacematToBasketSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromPlacematToBowlSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromPlacematToPlateSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromPlacematToTieredshelfSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromPlateToBowlSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromPlateToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + 
("gr1_unified.PosttrainPnPNovelFromPlateToPanSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromPlateToPlateSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromTrayToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromTrayToPlateSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromTrayToPotSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromTrayToTieredbasketSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ("gr1_unified.PosttrainPnPNovelFromTrayToTieredshelfSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), + ], + + "BEHAVIOR_challenge": [ + ("BEHAVIOR_challenge", 1.0, "R1Pro"), + ], + + + "SO101_pick": [ + ("pick_dataset_name", 1.0, "SO101"), + ], + + "arx_x5": [ + ("arx_x5", 1.0, "arx_x5"), + ], + + "robotwin": [ + ("adjust_bottle", 1.0, "robotwin"), + ("beat_block_hammer", 1.0, "robotwin"), + ("blocks_ranking_rgb", 1.0, "robotwin"), + ("blocks_ranking_size", 1.0, "robotwin"), + ("click_alarmclock", 1.0, "robotwin"), + ("click_bell", 1.0, "robotwin"), + ("dump_bin_bigbin", 1.0, "robotwin"), + ("grab_roller", 1.0, "robotwin"), + ("handover_block", 1.0, "robotwin"), + ("handover_mic", 1.0, "robotwin"), + ("hanging_mug", 1.0, "robotwin"), + ("lift_pot", 1.0, "robotwin"), + ("move_can_pot", 1.0, "robotwin"), + ("move_pillbottle_pad", 1.0, "robotwin"), + ("move_playingcard_away", 1.0, "robotwin"), + ("move_stapler_pad", 1.0, "robotwin"), + ("open_laptop", 1.0, "robotwin"), + ("open_microwave", 1.0, "robotwin"), + ("pick_diverse_bottles", 1.0, "robotwin"), + ("pick_dual_bottles", 1.0, "robotwin"), + ("place_a2b_left", 1.0, "robotwin"), + ("place_a2b_right", 1.0, "robotwin"), + ("place_bread_basket", 1.0, "robotwin"), + ("place_bread_skillet", 1.0, "robotwin"), + ("place_burger_fries", 1.0, "robotwin"), + ("place_can_basket", 1.0, "robotwin"), + ("place_cans_plasticbox", 1.0, "robotwin"), + ("place_container_plate", 1.0, "robotwin"), + ("place_dual_shoes", 1.0, "robotwin"), + ("place_empty_cup", 1.0, "robotwin"), + ("place_fan", 1.0, "robotwin"), + ("place_mouse_pad", 1.0, "robotwin"), + ("place_object_basket", 1.0, "robotwin"), + ("place_object_scale", 1.0, "robotwin"), + ("place_object_stand", 1.0, "robotwin"), + ("place_phone_stand", 1.0, "robotwin"), + ("place_shoe", 1.0, "robotwin"), + ("press_stapler", 1.0, "robotwin"), + ("put_bottles_dustbin", 1.0, "robotwin"), + ("put_object_cabinet", 1.0, "robotwin"), + ("rotate_qrcode", 1.0, "robotwin"), + ("scan_object", 1.0, "robotwin"), + ("shake_bottle", 1.0, "robotwin"), + ("shake_bottle_horizontally", 1.0, "robotwin"), + ("stack_blocks_three", 1.0, "robotwin"), + ("stack_blocks_two", 1.0, "robotwin"), + ("stack_bowls_three", 1.0, "robotwin"), + ("stack_bowls_two", 1.0, "robotwin"), + ("stamp_seal", 1.0, "robotwin"), + ("turn_switch", 1.0, "robotwin"), + ], + "cross_embodiedment_17tasks": [ + # libero - 4 tasks + ("libero_object_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 66984 + ("libero_goal_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 52042 + ("libero_spatial_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 52970 + ("libero_10_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 101469 + # robotwin - 8 tasks, selected by average trajectory length, 400, 500, 600, 700, 800, 900, 900, 
1200 + ("beat_block_hammer", 1.0, "robotwin"), # + ("place_shoe", 1.0, "robotwin"), # + ("dump_bin_bigbin", 1.0, "robotwin"), # + ("put_object_cabinet", 1.0, "robotwin"), # + ("stack_blocks_two", 1.0, "robotwin"), # + ("stack_bowls_two", 1.0, "robotwin"), # + ("shake_bottle", 1.0, "robotwin"), # + ("hanging_mug", 1.0, "robotwin"), # + # ("blocks_ranking_rgb", 1.0, "robotwin"), # + # gr1 - 5 tasks + ("gr1_unified.PnPBottleToCabinetClose_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 71341 + ("gr1_unified.PosttrainPnPNovelFromCuttingboardToBasketSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 48282 + ("gr1_unified.PosttrainPnPNovelFromPlacematToBasketSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 48066 + ("gr1_unified.PosttrainPnPNovelFromPlateToBowlSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 41518 + ("gr1_unified.PosttrainPnPNovelFromTrayToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 39739 + ], + "cross_embodiedment_21tasks": [ + # libero - 4 tasks + ("libero_object_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 66984 + ("libero_goal_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 52042 + ("libero_spatial_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 52970 + ("libero_10_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 101469 + # robotwin - 8 tasks, selected by average trajectory length, 400, 500, 600, 700, 800, 900, 900, 1200 + ("beat_block_hammer", 1.0, "robotwin"), # + ("place_shoe", 1.0, "robotwin"), # + ("dump_bin_bigbin", 1.0, "robotwin"), # + ("put_object_cabinet", 1.0, "robotwin"), # + ("stack_blocks_two", 1.0, "robotwin"), # + ("stack_bowls_two", 1.0, "robotwin"), # + ("shake_bottle", 1.0, "robotwin"), # + ("hanging_mug", 1.0, "robotwin"), # + # ("blocks_ranking_rgb", 1.0, "robotwin"), # + # gr1 - 5 tasks + ("gr1_unified.PnPBottleToCabinetClose_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 71341 + ("gr1_unified.PosttrainPnPNovelFromCuttingboardToBasketSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 48282 + ("gr1_unified.PosttrainPnPNovelFromPlacematToBasketSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 48066 + ("gr1_unified.PosttrainPnPNovelFromPlateToBowlSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 41518 + ("gr1_unified.PosttrainPnPNovelFromTrayToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 39739 + # real-world - 4 tasks + ("realworld_4tasks", 1.0, "real_world_franka"), + ], + "real_world_4tasks": [ + ("realworld_4tasks", 1.0, "real_world_franka"), + ], + "realworld_tasks_all": [ + ("realworld_tasks_all", 1.0, "real_world_franka"), + ], + "realworld_collect": [ + ("realworld_collect", 1.0, "real_world_franka"), + ], + "cross_embodiedment_13tasks": [ + # libero - 4 tasks + ("libero_object_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 66984 + ("libero_goal_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 52042 + ("libero_spatial_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 52970 + ("libero_10_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 101469 + # gr1 - 5 tasks + ("gr1_unified.PnPBottleToCabinetClose_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 71341 + ("gr1_unified.PosttrainPnPNovelFromCuttingboardToBasketSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 48282 + 
("gr1_unified.PosttrainPnPNovelFromPlacematToBasketSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 48066 + ("gr1_unified.PosttrainPnPNovelFromPlateToBowlSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 41518 + ("gr1_unified.PosttrainPnPNovelFromTrayToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 39739 + # real-world - 4 tasks + ("realworld_pickplace_4tasks", 1.0, "real_world_franka"), + ], + "cross_embodiedment_simulator": [ + # libero - 4 tasks + ("libero_object_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 66984 + ("libero_goal_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 52042 + ("libero_spatial_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 52970 + ("libero_10_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 101469 + # gr1 - 5 tasks + ("gr1_unified.PnPBottleToCabinetClose_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 71341 + ("gr1_unified.PosttrainPnPNovelFromCuttingboardToBasketSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 48282 + ("gr1_unified.PosttrainPnPNovelFromPlacematToBasketSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 48066 + ("gr1_unified.PosttrainPnPNovelFromPlateToBowlSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 41518 + ("gr1_unified.PosttrainPnPNovelFromTrayToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_200", 1.0, "fourier_gr1_arms_waist"), # 39739 + ], + "cross_embodiedment_simulator_moredata": [ + # libero - 4 tasks + ("libero_object_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 66984 + ("libero_goal_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 52042 + ("libero_spatial_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 52970 + ("libero_10_no_noops_1.0.0_lerobot", 1.0, "libero_franka"), # 101469 + ("libero_90_no_noops_lerobot", 1.0, "libero_franka"), # 901020 + # gr1 - 5 tasks + ("gr1_unified.PnPBottleToCabinetClose_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), # 71341 x 5 + ("gr1_unified.PosttrainPnPNovelFromCuttingboardToBasketSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), # 48282 x 5 + ("gr1_unified.PosttrainPnPNovelFromPlacematToBasketSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), # 48066 x 5 + ("gr1_unified.PosttrainPnPNovelFromPlateToBowlSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), # 41518 x 5 + ("gr1_unified.PosttrainPnPNovelFromTrayToCardboardboxSplitA_GR1ArmsAndWaistFourierHands_1000", 1.0, "fourier_gr1_arms_waist"), # 39739 x 5 + ], +} diff --git a/code/dataloader/gr00t_lerobot/schema.py b/code/dataloader/gr00t_lerobot/schema.py new file mode 100644 index 0000000000000000000000000000000000000000..64519e56a3c59e5d08c8f8f6370f640061859b4d --- /dev/null +++ b/code/dataloader/gr00t_lerobot/schema.py @@ -0,0 +1,221 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from enum import Enum +from typing import Optional + +from numpydantic import NDArray +from pydantic import BaseModel, Field, field_serializer + +from .embodiment_tags import EmbodimentTag + +# Common schema + + +class RotationType(Enum): + """Type of rotation representation""" + + AXIS_ANGLE = "axis_angle" + QUATERNION = "quaternion" + ROTATION_6D = "rotation_6d" + MATRIX = "matrix" + EULER_ANGLES_RPY = "euler_angles_rpy" + EULER_ANGLES_RYP = "euler_angles_ryp" + EULER_ANGLES_PRY = "euler_angles_pry" + EULER_ANGLES_PYR = "euler_angles_pyr" + EULER_ANGLES_YRP = "euler_angles_yrp" + EULER_ANGLES_YPR = "euler_angles_ypr" + + +# LeRobot schema + + +class LeRobotModalityField(BaseModel): + """Metadata for a LeRobot modality field.""" + + original_key: Optional[str] = Field( + default=None, + description="The original key of the modality in the LeRobot dataset", + ) + + +class LeRobotStateActionMetadata(LeRobotModalityField): + """Metadata for a LeRobot modality.""" + + start: int = Field( + ..., + description="The start index of the modality in the concatenated state/action vector", + ) + end: int = Field( + ..., + description="The end index of the modality in the concatenated state/action vector", + ) + rotation_type: Optional[RotationType] = Field( + default=None, description="The type of rotation for the modality" + ) + absolute: bool = Field(default=True, description="Whether the modality is absolute") + dtype: str = Field( + default="float64", + description="The data type of the modality. Defaults to float64.", + ) + range: Optional[tuple[float, float]] = Field( + default=None, + description="The range of the modality, if applicable. Defaults to None.", + ) + original_key: Optional[str] = Field( + default=None, + description="The original key of the modality in the LeRobot dataset.", + ) + + +class LeRobotStateMetadata(LeRobotStateActionMetadata): + """Metadata for a LeRobot state modality.""" + + original_key: Optional[str] = Field( + default="observation.state", # LeRobot convention for states + description="The original key of the state modality in the LeRobot dataset", + ) + + +class LeRobotActionMetadata(LeRobotStateActionMetadata): + """Metadata for a LeRobot action modality.""" + + original_key: Optional[str] = Field( + default="action", # LeRobot convention for actions + description="The original key of the action modality in the LeRobot dataset", + ) + + +class LeRobotModalityMetadata(BaseModel): + """Metadata for a LeRobot modality.""" + + state: dict[str, LeRobotStateMetadata] = Field( + ..., + description="The metadata for the state modality. The keys are the names of each split of the state vector.", + ) + action: dict[str, LeRobotActionMetadata] = Field( + ..., + description="The metadata for the action modality. The keys are the names of each split of the action vector.", + ) + video: dict[str, LeRobotModalityField] = Field( + ..., + description="The metadata for the video modality. The keys are the new names of each video modality.", + ) + annotation: Optional[dict[str, LeRobotModalityField]] = Field( + default=None, + description="The metadata for the annotation modality. The keys are the new names of each annotation modality.", + ) + + def get_key_meta(self, key: str) -> LeRobotModalityField: + """Get the metadata for a key in the LeRobot modality metadata. + + Args: + key (str): The key to get the metadata for. 
+ + Returns: + LeRobotModalityField: The metadata for the key. + + Example: + lerobot_modality_meta = LeRobotModalityMetadata.model_validate(U.load_json(modality_meta_path)) + lerobot_modality_meta.get_key_meta("state.joint_shoulder_y") + lerobot_modality_meta.get_key_meta("video.main_camera") + lerobot_modality_meta.get_key_meta("annotation.human.action.task_description") + """ + split_key = key.split(".") + modality = split_key[0] + subkey = ".".join(split_key[1:]) + if modality == "state": + if subkey not in self.state: + raise ValueError( + f"Key: {key}, state key {subkey} not found in metadata, available state keys: {self.state.keys()}" + ) + return self.state[subkey] + elif modality == "action": + if subkey not in self.action: + raise ValueError( + f"Key: {key}, action key {subkey} not found in metadata, available action keys: {self.action.keys()}" + ) + return self.action[subkey] + elif modality == "video": + if subkey not in self.video: + raise ValueError( + f"Key: {key}, video key {subkey} not found in metadata, available video keys: {self.video.keys()}" + ) + return self.video[subkey] + elif modality == "annotation": + assert ( + self.annotation is not None + ), "Trying to get annotation metadata for a dataset with no annotations" + if subkey not in self.annotation: + raise ValueError( + f"Key: {key}, annotation key {subkey} not found in metadata, available annotation keys: {self.annotation.keys()}" + ) + return self.annotation[subkey] + else: + raise ValueError(f"Key: {key}, unexpected modality: {modality}") + + +# Dataset schema (parsed from LeRobot schema and simplified) + + +class DatasetStatisticalValues(BaseModel): + max: NDArray = Field(..., description="Maximum values") + min: NDArray = Field(..., description="Minimum values") + mean: NDArray = Field(..., description="Mean values") + std: NDArray = Field(..., description="Standard deviation") + q01: NDArray = Field(..., description="1st percentile values") + q99: NDArray = Field(..., description="99th percentile values") + + @field_serializer("*", when_used="json") + def serialize_ndarray(self, v: NDArray) -> list[float]: + return v.tolist() # type: ignore + + +class DatasetStatistics(BaseModel): + state: dict[str, DatasetStatisticalValues] = Field(..., description="Statistics of the state") + action: dict[str, DatasetStatisticalValues] = Field(..., description="Statistics of the action") + + +class VideoMetadata(BaseModel): + """Metadata of the video modality""" + + resolution: tuple[int, int] = Field(..., description="Resolution of the video") + channels: int = Field(..., description="Number of channels in the video", gt=0) + fps: float = Field(..., description="Frames per second", gt=0) + + +class StateActionMetadata(BaseModel): + absolute: bool = Field(..., description="Whether the state or action is absolute") + rotation_type: Optional[RotationType] = Field(None, description="Type of rotation, if any") + shape: tuple[int, ...] 
= Field(..., description="Shape of the state or action") + continuous: bool = Field(..., description="Whether the state or action is continuous") + + +class DatasetModalities(BaseModel): + video: dict[str, VideoMetadata] = Field(..., description="Metadata of the video") + state: dict[str, StateActionMetadata] = Field(..., description="Metadata of the state") + action: dict[str, StateActionMetadata] = Field(..., description="Metadata of the action") + + +class DatasetMetadata(BaseModel): + """Metadata of the trainable dataset + + Changes: + - Update to use the new RawCommitHashMetadataMetadata_V1_2 + """ + + statistics: DatasetStatistics = Field(..., description="Statistics of the dataset") + modalities: DatasetModalities = Field(..., description="Metadata of the modalities") + embodiment_tag: EmbodimentTag = Field(..., description="Embodiment tag of the dataset") \ No newline at end of file diff --git a/code/dataloader/gr00t_lerobot/transform/__init__.py b/code/dataloader/gr00t_lerobot/transform/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bf22b5e4ca3d1c7937a25234cbf08e2644593587 --- /dev/null +++ b/code/dataloader/gr00t_lerobot/transform/__init__.py @@ -0,0 +1,41 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
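schema.py above describes dataset metadata with pydantic models whose NDArray fields are serialized to plain lists for JSON. A minimal sketch of that round trip, with made-up statistics and an assumed import path:

import numpy as np

from schema import DatasetStatisticalValues  # actual import path depends on packaging

stats = DatasetStatisticalValues(
    max=np.array([1.0, 2.0]),
    min=np.array([-1.0, -2.0]),
    mean=np.array([0.0, 0.0]),
    std=np.array([0.5, 1.0]),
    q01=np.array([-0.9, -1.8]),
    q99=np.array([0.9, 1.8]),
)
# The field_serializer converts every array to a list when dumping in JSON mode.
print(stats.model_dump(mode="json"))  # {'max': [1.0, 2.0], 'min': [-1.0, -2.0], ...}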
+ +from .base import ( + ComposedModalityTransform, + InvertibleModalityTransform, + ModalityTransform, +) +from .concat import ConcatTransform +# from .state_action import ( +# StateActionDropout, +# StateActionPerturbation, +# StateActionSinCosTransform, +# StateActionToTensor, +# StateActionTransform, +# ) +from .video import ( + VideoColorJitter, + VideoCrop, + VideoGrayscale, + VideoHorizontalFlip, + VideoRandomGrayscale, + VideoRandomPosterize, + VideoRandomRotation, + VideoResize, + VideoToNumpy, + VideoToTensor, + VideoTransform, +) diff --git a/code/dataloader/gr00t_lerobot/transform/__pycache__/__init__.cpython-310.pyc b/code/dataloader/gr00t_lerobot/transform/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4bb6b78fd530deecd32d433f87df53df03795d79 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/transform/__pycache__/__init__.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/transform/__pycache__/__init__.cpython-311.pyc b/code/dataloader/gr00t_lerobot/transform/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce43a586fe74874c6615489ebd4e023fde731959 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/transform/__pycache__/__init__.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/transform/__pycache__/base.cpython-310.pyc b/code/dataloader/gr00t_lerobot/transform/__pycache__/base.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a3c6c6e0302d9bd17d0291b7e3d7e22b7f85d2b Binary files /dev/null and b/code/dataloader/gr00t_lerobot/transform/__pycache__/base.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/transform/__pycache__/base.cpython-311.pyc b/code/dataloader/gr00t_lerobot/transform/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ad0b45da5432f326c2d3baa1dbab7ccb961aa9a7 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/transform/__pycache__/base.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/transform/__pycache__/concat.cpython-310.pyc b/code/dataloader/gr00t_lerobot/transform/__pycache__/concat.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c029431c2bd59ddcff7e95c263646b6bb97023f Binary files /dev/null and b/code/dataloader/gr00t_lerobot/transform/__pycache__/concat.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/transform/__pycache__/concat.cpython-311.pyc b/code/dataloader/gr00t_lerobot/transform/__pycache__/concat.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac8ffb7df701e713c72e5792d43fcf79beef8a98 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/transform/__pycache__/concat.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/transform/__pycache__/state_action.cpython-310.pyc b/code/dataloader/gr00t_lerobot/transform/__pycache__/state_action.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..434e5b2d121084471d1fc7868a168978869527cd Binary files /dev/null and b/code/dataloader/gr00t_lerobot/transform/__pycache__/state_action.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/transform/__pycache__/state_action.cpython-311.pyc b/code/dataloader/gr00t_lerobot/transform/__pycache__/state_action.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..971f5db4ee3c7ed4593366370b177bcb88e57679 
Binary files /dev/null and b/code/dataloader/gr00t_lerobot/transform/__pycache__/state_action.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/transform/__pycache__/video.cpython-310.pyc b/code/dataloader/gr00t_lerobot/transform/__pycache__/video.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0eebb8c0e5de56c6f6f1301c506e368ca5986ff9 Binary files /dev/null and b/code/dataloader/gr00t_lerobot/transform/__pycache__/video.cpython-310.pyc differ diff --git a/code/dataloader/gr00t_lerobot/transform/__pycache__/video.cpython-311.pyc b/code/dataloader/gr00t_lerobot/transform/__pycache__/video.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7b3a1366881b0af2248f142e90cf360dcfdedfce Binary files /dev/null and b/code/dataloader/gr00t_lerobot/transform/__pycache__/video.cpython-311.pyc differ diff --git a/code/dataloader/gr00t_lerobot/transform/base.py b/code/dataloader/gr00t_lerobot/transform/base.py new file mode 100644 index 0000000000000000000000000000000000000000..aac88559af98fa23f34fbb9135775d0819c281ef --- /dev/null +++ b/code/dataloader/gr00t_lerobot/transform/base.py @@ -0,0 +1,134 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from typing import Any + +from pydantic import BaseModel, ConfigDict, Field, PrivateAttr + +from ..schema import DatasetMetadata + + +class ModalityTransform(BaseModel, ABC): + """ + Abstract class for transforming data modalities, e.g. video frame augmentation or action normalization. + """ + + apply_to: list[str] = Field(..., description="The keys to apply the transform to.") + training: bool = Field( + default=True, description="Whether to apply the transform in training mode." + ) + _dataset_metadata: DatasetMetadata | None = PrivateAttr(default=None) + + model_config = ConfigDict(arbitrary_types_allowed=True) + + @property + def dataset_metadata(self) -> DatasetMetadata: + assert ( + self._dataset_metadata is not None + ), "Dataset metadata is not set. Please call set_metadata() before calling apply()." + return self._dataset_metadata + + @dataset_metadata.setter + def dataset_metadata(self, value: DatasetMetadata): + self._dataset_metadata = value + + def set_metadata(self, dataset_metadata: DatasetMetadata): + """ + Set the dataset metadata. This is useful for transforms that need to know the dataset metadata, e.g. to normalize actions. + Subclasses can override this method if they need to do something more complex. + """ + self.dataset_metadata = dataset_metadata + + def __call__(self, data: dict[str, Any]) -> dict[str, Any]: + """Apply the transformation to the data corresponding to target_keys and return the processed data. + + Args: + data (dict[str, Any]): The data to transform. + example: data = { + "video.image_side_0": np.ndarray, + "action.eef_position": np.ndarray, + ... 
+ } + + Returns: + dict[str, Any]: The transformed data. + example: transformed_data = { + "video.image_side_0": np.ndarray, + "action.eef_position": torch.Tensor, # Normalized and converted to tensor + ... + } + """ + return self.apply(data) + + @abstractmethod + def apply(self, data: dict[str, Any]) -> dict[str, Any]: + """Apply the transformation to the data corresponding to keys matching the `apply_to` regular expression and return the processed data.""" + + def train(self): + self.training = True + + def eval(self): + self.training = False + + +class InvertibleModalityTransform(ModalityTransform): + @abstractmethod + def unapply(self, data: dict[str, Any]) -> dict[str, Any]: + """Reverse the transformation to the data corresponding to keys matching the `apply_to` regular expression and return the processed data.""" + + +class ComposedModalityTransform(ModalityTransform): + """Compose multiple modality transforms.""" + + transforms: list[ModalityTransform] = Field(..., description="The transforms to compose.") + apply_to: list[str] = Field( + default_factory=list, description="Will be ignored for composed transforms." + ) + training: bool = Field( + default=True, description="Whether to apply the transform in training mode." + ) + + model_config = ConfigDict(arbitrary_types_allowed=True, from_attributes=True) + + def set_metadata(self, dataset_metadata: DatasetMetadata): + for transform in self.transforms: + transform.set_metadata(dataset_metadata) + + def apply(self, data: dict[str, Any]) -> dict[str, Any]: + for i, transform in enumerate(self.transforms): + try: + data = transform(data) + except Exception as e: + raise ValueError(f"Error applying transform {i} to data: {e}") from e + return data + + def unapply(self, data: dict[str, Any]) -> dict[str, Any]: + for i, transform in enumerate(reversed(self.transforms)): + if isinstance(transform, InvertibleModalityTransform): + try: + data = transform.unapply(data) + except Exception as e: + step = len(self.transforms) - i - 1 + raise ValueError(f"Error unapplying transform {step} to data: {e}") from e + return data + + def train(self): + for transform in self.transforms: + transform.train() + + def eval(self): + for transform in self.transforms: + transform.eval() diff --git a/code/dataloader/gr00t_lerobot/transform/concat.py b/code/dataloader/gr00t_lerobot/transform/concat.py new file mode 100644 index 0000000000000000000000000000000000000000..cf8eea4c77fc163ecdb0d25aeca26a2cde99f8c4 --- /dev/null +++ b/code/dataloader/gr00t_lerobot/transform/concat.py @@ -0,0 +1,215 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
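base.py above fixes the transform interface used throughout the dataloader: subclasses implement apply(), ComposedModalityTransform chains them, and set_metadata() propagates dataset metadata. A minimal sketch with a toy ScaleTransform; the class, the key name and the import path are illustrative assumptions:

from typing import Any

import torch

from base import ComposedModalityTransform, ModalityTransform  # import path depends on packaging

class ScaleTransform(ModalityTransform):
    # Toy transform: multiply every key listed in apply_to by a constant factor.
    factor: float = 2.0

    def apply(self, data: dict[str, Any]) -> dict[str, Any]:
        for key in self.apply_to:
            if key in data:
                data[key] = data[key] * self.factor
        return data

pipeline = ComposedModalityTransform(transforms=[ScaleTransform(apply_to=["state.position"])])
out = pipeline({"state.position": torch.ones(3)})
# out["state.position"] is now tensor([2., 2., 2.]); pipeline.eval() switches every
# member transform to evaluation mode.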
+ +from typing import Optional + +import numpy as np +import torch +from pydantic import Field + +from ..schema import DatasetMetadata, StateActionMetadata +from .base import InvertibleModalityTransform + + +class ConcatTransform(InvertibleModalityTransform): + """ + Concatenate the keys according to specified order. + """ + + # -- We inherit from ModalityTransform, so we keep apply_to as well -- + apply_to: list[str] = Field( + default_factory=list, description="Not used in this transform, kept for compatibility." + ) + + video_concat_order: list[str] = Field( + ..., + description="Concatenation order for each video modality. " + "Format: ['video.ego_view_pad_res224_freq20', ...]", + ) + + state_concat_order: Optional[list[str]] = Field( + default=None, + description="Concatenation order for each state modality. " + "Format: ['state.position', 'state.velocity', ...].", + ) + + action_concat_order: Optional[list[str]] = Field( + default=None, + description="Concatenation order for each action modality. " + "Format: ['action.position', 'action.velocity', ...].", + ) + + action_dims: dict[str, int] = Field( + default_factory=dict, + description="The dimensions of the action keys.", + ) + state_dims: dict[str, int] = Field( + default_factory=dict, + description="The dimensions of the state keys.", + ) + + def model_dump(self, *args, **kwargs): + if kwargs.get("mode", "python") == "json": + include = { + "apply_to", + "video_concat_order", + "state_concat_order", + "action_concat_order", + } + else: + include = kwargs.pop("include", None) + + return super().model_dump(*args, include=include, **kwargs) + + def apply(self, data: dict) -> dict: + grouped_keys = {} + for key in data.keys(): + try: + modality, _ = key.split(".") + except: # noqa: E722 + ### Handle language annotation special case + if "annotation" in key: + modality = "language" + else: + modality = "others" + if modality not in grouped_keys: + grouped_keys[modality] = [] + grouped_keys[modality].append(key) + + if "video" in grouped_keys: + # Check if keys in video_concat_order, state_concat_order, action_concat_order are + # ineed contained in the data. 
If not, then the keys are misspecified + video_keys = grouped_keys["video"] + assert self.video_concat_order is not None, f"{self.video_concat_order=}, {video_keys=}" + assert all( + item in video_keys for item in self.video_concat_order + ), f"keys in video_concat_order are misspecified, \n{video_keys=}, \n{self.video_concat_order=}" + + # Process each video view + unsqueezed_videos = [] + for video_key in self.video_concat_order: + video_data = data.pop(video_key) + unsqueezed_video = np.expand_dims( + video_data, axis=-4 + ) # [..., H, W, C] -> [..., 1, H, W, C] + unsqueezed_videos.append(unsqueezed_video) + # Concatenate along the new axis + unsqueezed_video = np.concatenate(unsqueezed_videos, axis=-4) # [..., V, H, W, C] + + # Video + data["video"] = unsqueezed_video + + # "state" + if "state" in grouped_keys: + state_keys = grouped_keys["state"] + assert self.state_concat_order is not None, f"{self.state_concat_order=}" + assert all( + item in state_keys for item in self.state_concat_order + ), f"keys in state_concat_order are misspecified, \n{state_keys=}, \n{self.state_concat_order=}" + # Check the state dims + for key in self.state_concat_order: + target_shapes = [self.state_dims[key]] + if self.is_rotation_key(key): + target_shapes.append(6) # Allow for rotation_6d + # if key in ["state.right_arm", "state.right_hand"]: + target_shapes.append(self.state_dims[key] * 2) # Allow for sin-cos transform + assert ( + data[key].shape[-1] in target_shapes + ), f"State dim mismatch for {key=}, {data[key].shape[-1]=}, {target_shapes=}" + # Concatenate the state keys + # We'll have StateActionToTensor before this transform, so here we use torch.cat + data["state"] = torch.cat( + [data.pop(key) for key in self.state_concat_order], dim=-1 + ) # [T, D_state] + + if "action" in grouped_keys: + action_keys = grouped_keys["action"] + assert self.action_concat_order is not None, f"{self.action_concat_order=}" + # Check if all keys in concat_order are present + assert set(self.action_concat_order) == set( + action_keys + ), f"{set(self.action_concat_order)=}, {set(action_keys)=}" + # Record the action dims + for key in self.action_concat_order: + target_shapes = [self.action_dims[key]] + if self.is_rotation_key(key): + target_shapes.append(3) # Allow for axis angle + assert ( + self.action_dims[key] == data[key].shape[-1] + ), f"Action dim mismatch for {key=}, {self.action_dims[key]=}, {data[key].shape[-1]=}" + # Concatenate the action keys + # We'll have StateActionToTensor before this transform, so here we use torch.cat + data["action"] = torch.cat( + [data.pop(key) for key in self.action_concat_order], dim=-1 + ) # [T, D_action] + + return data + + def unapply(self, data: dict) -> dict: + start_dim = 0 + assert "action" in data, f"{data.keys()=}" + # For those dataset without actions (LAPA), we'll never run unapply + assert self.action_concat_order is not None, f"{self.action_concat_order=}" + action_tensor = data.pop("action") + for key in self.action_concat_order: + if key not in self.action_dims: + raise ValueError(f"Action dim {key} not found in action_dims.") + end_dim = start_dim + self.action_dims[key] + data[key] = action_tensor[..., start_dim:end_dim] + start_dim = end_dim + if "state" in data: + assert self.state_concat_order is not None, f"{self.state_concat_order=}" + start_dim = 0 + state_tensor = data.pop("state") + for key in self.state_concat_order: + end_dim = start_dim + self.state_dims[key] + data[key] = state_tensor[..., start_dim:end_dim] + start_dim = end_dim + return data + 
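    # Worked illustration of the slicing performed by unapply() above; the key names and
    # dimensions are illustrative, not taken from a real modality config. With
    #   action_concat_order = ["action.eef_position", "action.gripper"]
    #   action_dims = {"action.eef_position": 3, "action.gripper": 1}
    # a concatenated action tensor of shape [T, 4] is split back into
    #   data["action.eef_position"] = action[..., 0:3]
    #   data["action.gripper"] = action[..., 3:4]
    # so apply() followed by unapply() recovers the original per-key tensors.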
+ def __call__(self, data: dict) -> dict: + return self.apply(data) + + def get_modality_metadata(self, key: str) -> StateActionMetadata: + modality, subkey = key.split(".") + assert self.dataset_metadata is not None, "Metadata not set" + modality_config = getattr(self.dataset_metadata.modalities, modality) + assert subkey in modality_config, f"{subkey=} not found in {modality_config=}" + assert isinstance( + modality_config[subkey], StateActionMetadata + ), f"Expected {StateActionMetadata} for {subkey=}, got {type(modality_config[subkey])=}" + return modality_config[subkey] + + def get_state_action_dims(self, key: str) -> int: + """Get the dimension of a state or action key from the dataset metadata.""" + modality_config = self.get_modality_metadata(key) + shape = modality_config.shape + assert len(shape) == 1, f"{shape=}" + return shape[0] + + def is_rotation_key(self, key: str) -> bool: + modality_config = self.get_modality_metadata(key) + return modality_config.rotation_type is not None + + def set_metadata(self, dataset_metadata: DatasetMetadata): + """Set the metadata and compute the dimensions of the state and action keys.""" + super().set_metadata(dataset_metadata) + # Pre-compute the dimensions of the state and action keys + if self.action_concat_order is not None: + for key in self.action_concat_order: + self.action_dims[key] = self.get_state_action_dims(key) + if self.state_concat_order is not None: + for key in self.state_concat_order: + self.state_dims[key] = self.get_state_action_dims(key) diff --git a/code/dataloader/gr00t_lerobot/transform/state_action.py b/code/dataloader/gr00t_lerobot/transform/state_action.py new file mode 100644 index 0000000000000000000000000000000000000000..a01d5f7c39903e3e78f4d92e6f901d93a99707e1 --- /dev/null +++ b/code/dataloader/gr00t_lerobot/transform/state_action.py @@ -0,0 +1,606 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import functools +import random +from typing import Any, ClassVar + +import numpy as np +import pytorch3d.transforms as pt +import torch +from pydantic import Field, PrivateAttr, field_validator, model_validator + +from ..schema import DatasetMetadata, RotationType, StateActionMetadata +from .base import InvertibleModalityTransform, ModalityTransform + + +class RotationTransform: + """Adapted from https://github.com/real-stanford/diffusion_policy/blob/548a52bbb105518058e27bf34dcf90bf6f73681a/diffusion_policy/model/common/rotation_transformer.py""" + + valid_reps = ["axis_angle", "euler_angles", "quaternion", "rotation_6d", "matrix"] + + def __init__(self, from_rep="axis_angle", to_rep="rotation_6d"): + """ + Valid representations + + Always use matrix as intermediate representation. 
+ """ + if from_rep.startswith("euler_angles"): + from_convention = from_rep.split("_")[-1] + from_rep = "euler_angles" + from_convention = from_convention.replace("r", "X").replace("p", "Y").replace("y", "Z") + else: + from_convention = None + if to_rep.startswith("euler_angles"): + to_convention = to_rep.split("_")[-1] + to_rep = "euler_angles" + to_convention = to_convention.replace("r", "X").replace("p", "Y").replace("y", "Z") + else: + to_convention = None + assert from_rep != to_rep, f"from_rep and to_rep cannot be the same: {from_rep}" + assert from_rep in self.valid_reps, f"Invalid from_rep: {from_rep}" + assert to_rep in self.valid_reps, f"Invalid to_rep: {to_rep}" + + forward_funcs = list() + inverse_funcs = list() + + if from_rep != "matrix": + funcs = [getattr(pt, f"{from_rep}_to_matrix"), getattr(pt, f"matrix_to_{from_rep}")] + if from_convention is not None: + funcs = [functools.partial(func, convention=from_convention) for func in funcs] + forward_funcs.append(funcs[0]) + inverse_funcs.append(funcs[1]) + + if to_rep != "matrix": + funcs = [getattr(pt, f"matrix_to_{to_rep}"), getattr(pt, f"{to_rep}_to_matrix")] + if to_convention is not None: + funcs = [functools.partial(func, convention=to_convention) for func in funcs] + forward_funcs.append(funcs[0]) + inverse_funcs.append(funcs[1]) + + inverse_funcs = inverse_funcs[::-1] + + self.forward_funcs = forward_funcs + self.inverse_funcs = inverse_funcs + + @staticmethod + def _apply_funcs(x: torch.Tensor, funcs: list) -> torch.Tensor: + assert isinstance(x, torch.Tensor) + for func in funcs: + x = func(x) + return x + + def forward(self, x: torch.Tensor) -> torch.Tensor: + assert isinstance( + x, torch.Tensor + ), f"Unexpected input type: {type(x)}. Expected type: {torch.Tensor}" + return self._apply_funcs(x, self.forward_funcs) + + def inverse(self, x: torch.Tensor) -> torch.Tensor: + assert isinstance( + x, torch.Tensor + ), f"Unexpected input type: {type(x)}. Expected type: {torch.Tensor}" + return self._apply_funcs(x, self.inverse_funcs) + + +class Normalizer: + valid_modes = ["q99", "mean_std", "min_max", "binary"] + + def __init__(self, mode: str, statistics: dict): + self.mode = mode + self.statistics = statistics + for key, value in self.statistics.items(): + self.statistics[key] = torch.tensor(value) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + assert isinstance( + x, torch.Tensor + ), f"Unexpected input type: {type(x)}. 
Expected type: {torch.Tensor}" + + # Normalize the tensor + if self.mode == "q99": + # Range of q99 is [-1, 1] + q01 = self.statistics["q01"].to(x.dtype) + q99 = self.statistics["q99"].to(x.dtype) + + # In the case of q01 == q99, the normalization will be undefined + # So we set the normalized values to the original values + mask = q01 != q99 + normalized = torch.zeros_like(x) + + # Normalize the values where q01 != q99 + # Formula: 2 * (x - q01) / (q99 - q01) - 1 + normalized[..., mask] = (x[..., mask] - q01[..., mask]) / ( + q99[..., mask] - q01[..., mask] + ) + normalized[..., mask] = 2 * normalized[..., mask] - 1 + + # Set the normalized values to the original values where q01 == q99 + normalized[..., ~mask] = x[..., ~mask].to(x.dtype) + + # Clip the normalized values to be between -1 and 1 + normalized = torch.clamp(normalized, -1, 1) + + elif self.mode == "mean_std": + # Range of mean_std is not fixed, but can be positive or negative + mean = self.statistics["mean"].to(x.dtype) + std = self.statistics["std"].to(x.dtype) + + # In the case of std == 0, the normalization will be undefined + # So we set the normalized values to the original values + mask = std != 0 + normalized = torch.zeros_like(x) + + # Normalize the values where std != 0 + # Formula: (x - mean) / std + normalized[..., mask] = (x[..., mask] - mean[..., mask]) / std[..., mask] + + # Set the normalized values to the original values where std == 0 + normalized[..., ~mask] = x[..., ~mask].to(x.dtype) + + elif self.mode == "min_max": + # Range of min_max is [-1, 1] + min = self.statistics["min"].to(x.dtype) + max = self.statistics["max"].to(x.dtype) + + # In the case of min == max, the normalization will be undefined + # So we set the normalized values to the original values + mask = min != max + normalized = torch.zeros_like(x) + + # Normalize the values where min != max + # Formula: 2 * (x - min) / (max - min) - 1 + normalized[..., mask] = (x[..., mask] - min[..., mask]) / ( + max[..., mask] - min[..., mask] + ) + normalized[..., mask] = 2 * normalized[..., mask] - 1 + + # Set the normalized values to the original values where min == max + # normalized[..., ~mask] = x[..., ~mask].to(x.dtype) + # Set the normalized values to 0 where min == max + normalized[..., ~mask] = 0 + + elif self.mode == "scale": + # Range of scale is [0, 1] + min = self.statistics["min"].to(x.dtype) + max = self.statistics["max"].to(x.dtype) + abs_max = torch.max(torch.abs(min), torch.abs(max)) + mask = abs_max != 0 + normalized = torch.zeros_like(x) + normalized[..., mask] = x[..., mask] / abs_max[..., mask] + normalized[..., ~mask] = 0 + + elif self.mode == "binary": + # Range of binary is [0, 1] + normalized = (x > 0.5).to(x.dtype) + else: + raise ValueError(f"Invalid normalization mode: {self.mode}") + + return normalized + + def inverse(self, x: torch.Tensor) -> torch.Tensor: + assert isinstance( + x, torch.Tensor + ), f"Unexpected input type: {type(x)}. 
Expected type: {torch.Tensor}" + if self.mode == "q99": + q01 = self.statistics["q01"].to(x.dtype) + q99 = self.statistics["q99"].to(x.dtype) + return (x + 1) / 2 * (q99 - q01) + q01 + elif self.mode == "mean_std": + mean = self.statistics["mean"].to(x.dtype) + std = self.statistics["std"].to(x.dtype) + return x * std + mean + elif self.mode == "min_max": + min = self.statistics["min"].to(x.dtype) + max = self.statistics["max"].to(x.dtype) + return (x + 1) / 2 * (max - min) + min + elif self.mode == "binary": + return (x > 0.5).to(x.dtype) + else: + raise ValueError(f"Invalid normalization mode: {self.mode}") + + +class StateActionToTensor(InvertibleModalityTransform): + """ + Transforms states and actions to tensors. + """ + + input_dtypes: dict[str, np.dtype] = Field( + default_factory=dict, description="The input dtypes for each state key." + ) + output_dtypes: dict[str, torch.dtype] = Field( + default_factory=dict, description="The output dtypes for each state key." + ) + + def model_dump(self, *args, **kwargs): + if kwargs.get("mode", "python") == "json": + include = {"apply_to"} + else: + include = kwargs.pop("include", None) + + return super().model_dump(*args, include=include, **kwargs) + + @field_validator("input_dtypes", "output_dtypes", mode="before") + def validate_dtypes(cls, v): + for key, dtype in v.items(): + if isinstance(dtype, str): + if dtype.startswith("torch."): + dtype_split = dtype.split(".")[-1] + v[key] = getattr(torch, dtype_split) + elif dtype.startswith("np.") or dtype.startswith("numpy."): + dtype_split = dtype.split(".")[-1] + v[key] = np.dtype(dtype_split) + else: + raise ValueError(f"Invalid dtype: {dtype}") + return v + + def apply(self, data: dict[str, Any]) -> dict[str, Any]: + for key in self.apply_to: + if key not in data: + continue + value = data[key] + assert isinstance( + value, np.ndarray + ), f"Unexpected input type: {type(value)}. Expected type: {np.ndarray}" + data[key] = torch.from_numpy(value) + if key in self.output_dtypes: + data[key] = data[key].to(self.output_dtypes[key]) + return data + + def unapply(self, data: dict[str, Any]) -> dict[str, Any]: + for key in self.apply_to: + if key not in data: + continue + value = data[key] + assert isinstance( + value, torch.Tensor + ), f"Unexpected input type: {type(value)}. Expected type: {torch.Tensor}" + data[key] = value.numpy() + if key in self.input_dtypes: + data[key] = data[key].astype(self.input_dtypes[key]) + return data + + +class StateActionTransform(InvertibleModalityTransform): + """ + Class for state or action transform. + + Args: + apply_to (list[str]): The keys in the modality to load and transform. + normalization_modes (dict[str, str]): The normalization modes for each state key. + If a state key in apply_to is not present in the dictionary, it will not be normalized. + target_rotations (dict[str, str]): The target representations for each state key. + If a state key in apply_to is not present in the dictionary, it will not be rotated. + """ + + # Configurable attributes + apply_to: list[str] = Field(..., description="The keys in the modality to load and transform.") + normalization_modes: dict[str, str] = Field( + default_factory=dict, description="The normalization modes for each state key." + ) + target_rotations: dict[str, str] = Field( + default_factory=dict, description="The target representations for each state key." + ) + normalization_statistics: dict[str, dict] = Field( + default_factory=dict, description="The statistics for each state key." 
+ ) + modality_metadata: dict[str, StateActionMetadata] = Field( + default_factory=dict, description="The modality metadata for each state key." + ) + + # Model variables + _rotation_transformers: dict[str, RotationTransform] = PrivateAttr(default_factory=dict) + _normalizers: dict[str, Normalizer] = PrivateAttr(default_factory=dict) + _input_dtypes: dict[str, np.dtype | torch.dtype] = PrivateAttr(default_factory=dict) + + # Model constants + _DEFAULT_MIN_MAX_STATISTICS: ClassVar[dict] = { + "rotation_6d": { + "min": [-1, -1, -1, -1, -1, -1], + "max": [1, 1, 1, 1, 1, 1], + }, + "euler_angles": { + "min": [-np.pi, -np.pi, -np.pi], + "max": [np.pi, np.pi, np.pi], + }, + "quaternion": { + "min": [-1, -1, -1, -1], + "max": [1, 1, 1, 1], + }, + "axis_angle": { + "min": [-np.pi, -np.pi, -np.pi], + "max": [np.pi, np.pi, np.pi], + }, + } + + def model_dump(self, *args, **kwargs): + if kwargs.get("mode", "python") == "json": + include = {"apply_to", "normalization_modes", "target_rotations"} + else: + include = kwargs.pop("include", None) + + return super().model_dump(*args, include=include, **kwargs) + + @field_validator("modality_metadata", mode="before") + def validate_modality_metadata(cls, v): + for modality_key, config in v.items(): + if isinstance(config, dict): + config = StateActionMetadata.model_validate(config) + else: + assert isinstance( + config, StateActionMetadata + ), f"Invalid source rotation config: {config}" + v[modality_key] = config + return v + + @model_validator(mode="after") + def validate_normalization_statistics(self): + for modality_key, normalization_statistics in self.normalization_statistics.items(): + if modality_key in self.normalization_modes: + normalization_mode = self.normalization_modes[modality_key] + if normalization_mode == "min_max": + assert ( + "min" in normalization_statistics and "max" in normalization_statistics + ), f"Min and max statistics are required for min_max normalization, but got {normalization_statistics}" + assert len(normalization_statistics["min"]) == len( + normalization_statistics["max"] + ), f"Min and max statistics must have the same length, but got {normalization_statistics['min']} and {normalization_statistics['max']}" + elif normalization_mode == "mean_std": + assert ( + "mean" in normalization_statistics and "std" in normalization_statistics + ), f"Mean and std statistics are required for mean_std normalization, but got {normalization_statistics}" + assert len(normalization_statistics["mean"]) == len( + normalization_statistics["std"] + ), f"Mean and std statistics must have the same length, but got {normalization_statistics['mean']} and {normalization_statistics['std']}" + elif normalization_mode == "q99": + assert ( + "q01" in normalization_statistics and "q99" in normalization_statistics + ), f"q01 and q99 statistics are required for q99 normalization, but got {normalization_statistics}" + assert len(normalization_statistics["q01"]) == len( + normalization_statistics["q99"] + ), f"q01 and q99 statistics must have the same length, but got {normalization_statistics['q01']} and {normalization_statistics['q99']}" + elif normalization_mode == "binary": + assert ( + len(normalization_statistics) == 1 + ), f"Binary normalization should only have one value, but got {normalization_statistics}" + assert normalization_statistics[0] in [ + 0, + 1, + ], f"Binary normalization should only have 0 or 1, but got {normalization_statistics[0]}" + else: + raise ValueError(f"Invalid normalization mode: {normalization_mode}") + return self + + def 
set_metadata(self, dataset_metadata: DatasetMetadata): + dataset_statistics = dataset_metadata.statistics + modality_metadata = dataset_metadata.modalities + + # Check that all state keys specified in apply_to have their modality_metadata + for key in self.apply_to: + split_key = key.split(".", 1) + assert len(split_key) == 2, "State keys should have two parts: 'modality.key'" + if key not in self.modality_metadata: + modality, state_key = split_key + assert hasattr(modality_metadata, modality), f"{modality} config not found" + assert state_key in getattr( + modality_metadata, modality + ), f"{state_key} config not found" + self.modality_metadata[key] = getattr(modality_metadata, modality)[state_key] + + # Check that all state keys specified in normalization_modes have their statistics in state_statistics + for key in self.normalization_modes: + split_key = key.split(".", 1) + assert len(split_key) == 2, "State keys should have two parts: 'modality.key'" + modality, state_key = split_key + assert hasattr(dataset_statistics, modality), f"{modality} statistics not found" + assert state_key in getattr( + dataset_statistics, modality + ), f"{state_key} statistics not found" + assert ( + len(getattr(modality_metadata, modality)[state_key].shape) == 1 + ), f"{getattr(modality_metadata, modality)[state_key].shape=}" + self.normalization_statistics[key] = getattr(dataset_statistics, modality)[ + state_key + ].model_dump() + + # Initialize the rotation transformers + for key in self.target_rotations: + # Get the original representation of the state + from_rep = self.modality_metadata[key].rotation_type + assert from_rep is not None, f"Source rotation type not found for {key}" + + # Get the target representation of the state, will raise an error if the target representation is not valid + to_rep = RotationType(self.target_rotations[key]) + + # If the original representation is not the same as the target representation, initialize the rotation transformer + if from_rep != to_rep: + self._rotation_transformers[key] = RotationTransform( + from_rep=from_rep.value, to_rep=to_rep.value + ) + + # Initialize the normalizers + for key in self.normalization_modes: + modality, state_key = key.split(".", 1) + # If the state has a nontrivial rotation, we need to handle it more carefully + # For absolute rotations, we need to convert them to the target representation and normalize them using min_max mode, + # since we can infer the bounds by the representation + # For relative rotations, we cannot normalize them as we don't know the bounds + if key in self._rotation_transformers: + # Case 1: Absolute rotation + if self.modality_metadata[key].absolute: + # Check that the normalization mode is valid + assert ( + self.normalization_modes[key] == "min_max" + ), "Absolute rotations that are converted to other formats must be normalized using `min_max` mode" + rotation_type = RotationType(self.target_rotations[key]).value + # If the target representation is euler angles, we need to parse the convention + if rotation_type.startswith("euler_angles"): + rotation_type = "euler_angles" + # Get the statistics for the target representation + statistics = self._DEFAULT_MIN_MAX_STATISTICS[rotation_type] + # Case 2: Relative rotation + else: + raise ValueError( + f"Cannot normalize relative rotations: {key} that's converted to {self.target_rotations[key]}" + ) + # If the state is not continuous, we should not use normalization modes other than binary + elif ( + not self.modality_metadata[key].continuous + and 
self.normalization_modes[key] != "binary" + ): + raise ValueError( + f"{key} is not continuous, so it should be normalized using `binary` mode" + ) + # Initialize the normalizer + else: + statistics = self.normalization_statistics[key] + self._normalizers[key] = Normalizer( + mode=self.normalization_modes[key], statistics=statistics + ) + + def apply(self, data: dict[str, Any]) -> dict[str, Any]: + for key in self.apply_to: + if key not in data: + # We allow some keys to be missing in the data, and only process the keys that are present + continue + if key not in self._input_dtypes: + input_dtype = data[key].dtype + assert isinstance( + input_dtype, torch.dtype + ), f"Unexpected input dtype: {input_dtype}. Expected type: {torch.dtype}" + self._input_dtypes[key] = input_dtype + else: + assert ( + data[key].dtype == self._input_dtypes[key] + ), f"All states corresponding to the same key must be of the same dtype, input dtype: {data[key].dtype}, expected dtype: {self._input_dtypes[key]}" + # Rotate the state + state = data[key] + if key in self._rotation_transformers: + state = self._rotation_transformers[key].forward(state) + # Normalize the state + if key in self._normalizers: + state = self._normalizers[key].forward(state) + data[key] = state + return data + + def unapply(self, data: dict[str, Any]) -> dict[str, Any]: + for key in self.apply_to: + if key not in data: + continue + state = data[key] + assert isinstance( + state, torch.Tensor + ), f"Unexpected state type: {type(state)}. Expected type: {torch.Tensor}" + # Unnormalize the state + if key in self._normalizers: + state = self._normalizers[key].inverse(state) + # Change the state back to its original representation + if key in self._rotation_transformers: + state = self._rotation_transformers[key].inverse(state) + assert isinstance( + state, torch.Tensor + ), f"State should be tensor after unapplying transformations, but got {type(state)}" + # Only convert back to the original dtype if it's known, i.e. `apply` was called before + # If not, we don't know the original dtype, so we don't convert + if key in self._input_dtypes: + original_dtype = self._input_dtypes[key] + if isinstance(original_dtype, np.dtype): + state = state.numpy().astype(original_dtype) + elif isinstance(original_dtype, torch.dtype): + state = state.to(original_dtype) + else: + raise ValueError(f"Invalid input dtype: {original_dtype}") + data[key] = state + return data + + +class StateActionPerturbation(ModalityTransform): + """ + Class for state or action perturbation. + + Args: + apply_to (list[str]): The keys in the modality to load and transform. + std (float): Standard deviation of the noise to be added to the state or action. + """ + + # Configurable attributes + std: float = Field( + ..., description="Standard deviation of the noise to be added to the state or action." + ) + + def apply(self, data: dict[str, Any]) -> dict[str, Any]: + if not self.training: + # Don't perturb the data in eval mode + return data + if self.std < 0: + # If the std is negative, we don't add any noise + return data + for key in self.apply_to: + state = data[key] + assert isinstance(state, torch.Tensor) + transformed_data_min = torch.min(state) + transformed_data_max = torch.max(state) + noise = torch.randn_like(state) * self.std + state += noise + # Clip to the original range + state = torch.clamp(state, transformed_data_min, transformed_data_max) + data[key] = state + return data + + +class StateActionDropout(ModalityTransform): + """ + Class for state or action dropout. 
+ + Args: + apply_to (list[str]): The keys in the modality to load and transform. + dropout_prob (float): Probability of dropping out a state or action. + """ + + # Configurable attributes + dropout_prob: float = Field(..., description="Probability of dropping out a state or action.") + + def apply(self, data: dict[str, Any]) -> dict[str, Any]: + if not self.training: + # Don't drop out the data in eval mode + return data + if self.dropout_prob < 0: + # If the dropout probability is negative, we don't drop out any states + return data + if self.dropout_prob > 1e-9 and random.random() < self.dropout_prob: + for key in self.apply_to: + state = data[key] + assert isinstance(state, torch.Tensor) + state = torch.zeros_like(state) + data[key] = state + return data + + +class StateActionSinCosTransform(ModalityTransform): + """ + Class for state or action sin-cos transform. + + Args: + apply_to (list[str]): The keys in the modality to load and transform. + """ + + def apply(self, data: dict[str, Any]) -> dict[str, Any]: + for key in self.apply_to: + state = data[key] + assert isinstance(state, torch.Tensor) + sin_state = torch.sin(state) + cos_state = torch.cos(state) + data[key] = torch.cat([sin_state, cos_state], dim=-1) + return data diff --git a/code/dataloader/gr00t_lerobot/transform/video.py b/code/dataloader/gr00t_lerobot/transform/video.py new file mode 100644 index 0000000000000000000000000000000000000000..15310f697259d7ee6ed8eac7e5abaca211b81dc9 --- /dev/null +++ b/code/dataloader/gr00t_lerobot/transform/video.py @@ -0,0 +1,612 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
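# --- Illustrative aside (not part of the original diff) ---
# The state/action transforms added above in transform/state_action.py reduce to
# three tensor operations: StateActionPerturbation adds Gaussian noise clipped back
# into the tensor's original min/max range, StateActionDropout zeroes an entire key
# with probability `dropout_prob`, and StateActionSinCosTransform doubles the last
# dimension with sin/cos features. A minimal standalone sketch of those operations
# (helper names are ours, only `torch` and `random` assumed):
import random

import torch


def perturb(state: torch.Tensor, std: float) -> torch.Tensor:
    # Add Gaussian noise, then clamp back into the original value range.
    lo, hi = state.min(), state.max()
    return torch.clamp(state + torch.randn_like(state) * std, lo, hi)


def dropout(state: torch.Tensor, p: float) -> torch.Tensor:
    # All-or-nothing: the whole state/action tensor is either kept or zeroed.
    return torch.zeros_like(state) if random.random() < p else state


def sincos(state: torch.Tensor) -> torch.Tensor:
    # Bounded, continuous encoding of joint angles; the last dimension doubles.
    return torch.cat([torch.sin(state), torch.cos(state)], dim=-1)


if __name__ == "__main__":
    chunk = torch.randn(16, 7)  # e.g. a 16-step chunk of 7-DoF actions
    out = sincos(dropout(perturb(chunk, std=0.01), p=0.1))
    print(out.shape)  # torch.Size([16, 14])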
+ +from typing import Any, Callable, ClassVar, Literal + +import albumentations as A +import cv2 +import numpy as np +import torch +import torchvision.transforms.v2 as T +from einops import rearrange +from pydantic import Field, PrivateAttr, field_validator +from PIL import Image + +from ..schema import DatasetMetadata +from .base import ModalityTransform + + +class VideoTransform(ModalityTransform): + # Configurable attributes + backend: str = Field( + default="torchvision", description="The backend to use for the transformations" + ) + + # Model variables + _train_transform: Callable | None = PrivateAttr(default=None) + _eval_transform: Callable | None = PrivateAttr(default=None) + _original_resolutions: dict[str, tuple[int, int]] = PrivateAttr(default_factory=dict) + + # Model constants + _INTERPOLATION_MAP: ClassVar[dict[str, dict[str, Any]]] = PrivateAttr( + { + "nearest": { + "albumentations": cv2.INTER_NEAREST, + "torchvision": T.InterpolationMode.NEAREST, + }, + "linear": { + "albumentations": cv2.INTER_LINEAR, + "torchvision": T.InterpolationMode.BILINEAR, + }, + "cubic": { + "albumentations": cv2.INTER_CUBIC, + "torchvision": T.InterpolationMode.BICUBIC, + }, + "area": { + "albumentations": cv2.INTER_AREA, + "torchvision": None, # Torchvision does not support this interpolation mode + }, + "lanczos4": { + "albumentations": cv2.INTER_LANCZOS4, # Lanczos with a 4x4 filter + "torchvision": T.InterpolationMode.LANCZOS, # Torchvision does not specify filter size, might be different from 4x4 + }, + "linear_exact": { + "albumentations": cv2.INTER_LINEAR_EXACT, + "torchvision": None, # Torchvision does not support this interpolation mode + }, + "nearest_exact": { + "albumentations": cv2.INTER_NEAREST_EXACT, + "torchvision": T.InterpolationMode.NEAREST_EXACT, + }, + "max": { + "albumentations": cv2.INTER_MAX, + "torchvision": None, + }, + } + ) + + @property + def train_transform(self) -> Callable: + assert ( + self._train_transform is not None + ), "Transform is not set. Please call set_metadata() before calling apply()." + return self._train_transform + + @train_transform.setter + def train_transform(self, value: Callable): + self._train_transform = value + + @property + def eval_transform(self) -> Callable | None: + return self._eval_transform + + @eval_transform.setter + def eval_transform(self, value: Callable | None): + self._eval_transform = value + + @property + def original_resolutions(self) -> dict[str, tuple[int, int]]: + assert ( + self._original_resolutions is not None + ), "Original resolutions are not set. Please call set_metadata() before calling apply()." 
+ return self._original_resolutions + + @original_resolutions.setter + def original_resolutions(self, value: dict[str, tuple[int, int]]): + self._original_resolutions = value + + def check_input(self, data: dict[str, Any]): + if self.backend == "torchvision": + for key in self.apply_to: + assert isinstance(data[key], torch.Tensor), f"Video {key} is not a torch tensor" + assert data[key].ndim in [ + 4, + 5, + ], f"Expected video {key} to have 4 or 5 dimensions (T, C, H, W or T, B, C, H, W), got {data[key].ndim}" + elif self.backend == "albumentations": + for key in self.apply_to: + assert isinstance(data[key], np.ndarray), f"Video {key} is not a numpy array" + assert data[key].ndim in [ + 4, + 5, + ], f"Expected video {key} to have 4 or 5 dimensions (T, C, H, W or T, B, C, H, W), got {data[key].ndim}" + else: + raise ValueError(f"Backend {self.backend} not supported") + + def set_metadata(self, dataset_metadata: DatasetMetadata): + super().set_metadata(dataset_metadata) + self.original_resolutions = {} + for key in self.apply_to: + split_keys = key.split(".") + assert len(split_keys) == 2, f"Invalid key: {key}. Expected format: modality.key" + sub_key = split_keys[1] + if sub_key in dataset_metadata.modalities.video: + self.original_resolutions[key] = dataset_metadata.modalities.video[ + sub_key + ].resolution + else: + raise ValueError( + f"Video key {sub_key} not found in dataset metadata. Available keys: {dataset_metadata.modalities.video.keys()}" + ) + train_transform = self.get_transform(mode="train") + eval_transform = self.get_transform(mode="eval") + if self.backend == "albumentations": + self.train_transform = A.ReplayCompose(transforms=[train_transform]) # type: ignore + if eval_transform is not None: + self.eval_transform = A.ReplayCompose(transforms=[eval_transform]) # type: ignore + else: + assert train_transform is not None, "Train transform must be set" + self.train_transform = train_transform + self.eval_transform = eval_transform + + def apply(self, data: dict[str, Any]) -> dict[str, Any]: + if self.training: + transform = self.train_transform + else: + transform = self.eval_transform + if transform is None: + return data + assert ( + transform is not None + ), "Transform is not set. Please call set_metadata() before calling apply()." 
+ try: + self.check_input(data) + except AssertionError as e: + raise ValueError( + f"Input data does not match the expected format for {self.__class__.__name__}: {e}" + ) from e + + # Concatenate views + views = [data[key] for key in self.apply_to] + num_views = len(views) + is_batched = views[0].ndim == 5 + bs = views[0].shape[0] if is_batched else 1 + if isinstance(views[0], torch.Tensor): + views = torch.cat(views, 0) + elif isinstance(views[0], np.ndarray): + views = np.concatenate(views, 0) + else: + raise ValueError(f"Unsupported view type: {type(views[0])}") + if is_batched: + views = rearrange(views, "(v b) t c h w -> (v b t) c h w", v=num_views, b=bs) + # Apply the transform + if self.backend == "torchvision": + views = transform(views) + elif self.backend == "albumentations": + assert isinstance(transform, A.ReplayCompose), "Transform must be a ReplayCompose" + first_frame = views[0] + transformed = transform(image=first_frame) + replay_data = transformed["replay"] + transformed_first_frame = transformed["image"] + + if len(views) > 1: + # Apply the same transformations to the rest of the frames + transformed_frames = [ + transform.replay(replay_data, image=frame)["image"] for frame in views[1:] + ] + # Add the first frame back + transformed_frames = [transformed_first_frame] + transformed_frames + else: + # If there is only one frame, just make a list with one frame + transformed_frames = [transformed_first_frame] + + # Delete the replay data to save memory + del replay_data + views = np.stack(transformed_frames, 0) + + else: + raise ValueError(f"Backend {self.backend} not supported") + # Split views + if is_batched: + views = rearrange(views, "(v b t) c h w -> v b t c h w", v=num_views, b=bs) + else: + views = rearrange(views, "(v t) c h w -> v t c h w", v=num_views) + for key, view in zip(self.apply_to, views): + data[key] = view + return data + + @classmethod + def _validate_interpolation(cls, interpolation: str): + if interpolation not in cls._INTERPOLATION_MAP: + raise ValueError(f"Interpolation mode {interpolation} not supported") + + def _get_interpolation(self, interpolation: str, backend: str = "torchvision"): + """ + Get the interpolation mode for the given backend. + + Args: + interpolation (str): The interpolation mode. + backend (str): The backend to use. + + Returns: + Any: The interpolation mode for the given backend. + """ + return self._INTERPOLATION_MAP[interpolation][backend] + + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable | None: + raise NotImplementedError( + "set_transform is not implemented for VideoTransform. Please implement this function to set the transforms." + ) + + +class VideoCrop(VideoTransform): + height: int | None = Field(default=None, description="The height of the input image") + width: int | None = Field(default=None, description="The width of the input image") + scale: float = Field( + ..., + description="The scale of the crop. The crop size is (width * scale, height * scale)", + ) + + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable: + """Get the transform for the given mode. + + Args: + mode (Literal["train", "eval"]): The mode to get the transform for. + + Returns: + Callable: If mode is "train", return a random crop transform. If mode is "eval", return a center crop transform. + """ + # 1. 
Check the input resolution + assert ( + len(set(self.original_resolutions.values())) == 1 + ), f"All video keys must have the same resolution, got: {self.original_resolutions}" + if self.height is None: + assert self.width is None, "Height and width must be either both provided or both None" + self.width, self.height = self.original_resolutions[self.apply_to[0]] + else: + assert ( + self.width is not None + ), "Height and width must be either both provided or both None" + # 2. Create the transform + size = (int(self.height * self.scale), int(self.width * self.scale)) + if self.backend == "torchvision": + if mode == "train": + return T.RandomCrop(size) + elif mode == "eval": + return T.CenterCrop(size) + else: + raise ValueError(f"Crop mode {mode} not supported") + elif self.backend == "albumentations": + if mode == "train": + return A.RandomCrop(height=size[0], width=size[1], p=1) + elif mode == "eval": + return A.CenterCrop(height=size[0], width=size[1], p=1) + else: + raise ValueError(f"Crop mode {mode} not supported") + else: + raise ValueError(f"Backend {self.backend} not supported") + + def check_input(self, data: dict[str, Any]): + super().check_input(data) + # Check the input resolution + for key in self.apply_to: + if self.backend == "torchvision": + height, width = data[key].shape[-2:] + elif self.backend == "albumentations": + height, width = data[key].shape[-3:-1] + else: + raise ValueError(f"Backend {self.backend} not supported") + assert ( + height == self.height and width == self.width + ), f"Video {key} has invalid shape {height, width}, expected {self.height, self.width}" + + +class VideoResize(VideoTransform): + height: int = Field(..., description="The height of the resize") + width: int = Field(..., description="The width of the resize") + interpolation: str = Field(default="linear", description="The interpolation mode") + antialias: bool = Field(default=True, description="Whether to apply antialiasing") + + @field_validator("interpolation") + def validate_interpolation(cls, v): + cls._validate_interpolation(v) + return v + + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable: + """Get the resize transform. Same transform for both train and eval. + + Args: + mode (Literal["train", "eval"]): The mode to get the transform for. + + Returns: + Callable: The resize transform. + """ + interpolation = self._get_interpolation(self.interpolation, self.backend) + if interpolation is None: + raise ValueError( + f"Interpolation mode {self.interpolation} not supported for torchvision" + ) + if self.backend == "torchvision": + size = (self.height, self.width) + return T.Resize(size, interpolation=interpolation, antialias=self.antialias) + elif self.backend == "albumentations": + return A.Resize( + height=self.height, + width=self.width, + interpolation=interpolation, + p=1, + ) + else: + raise ValueError(f"Backend {self.backend} not supported") + + +class VideoRandomRotation(VideoTransform): + degrees: float | tuple[float, float] = Field( + ..., description="The degrees of the random rotation" + ) + interpolation: str = Field("linear", description="The interpolation mode") + + @field_validator("interpolation") + def validate_interpolation(cls, v): + cls._validate_interpolation(v) + return v + + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable | None: + """Get the random rotation transform, only used in train mode. + + Args: + mode (Literal["train", "eval"]): The mode to get the transform for. 
+ + Returns: + Callable | None: The random rotation transform. None for eval mode. + """ + if mode == "eval": + return None + interpolation = self._get_interpolation(self.interpolation, self.backend) + if interpolation is None: + raise ValueError( + f"Interpolation mode {self.interpolation} not supported for torchvision" + ) + if self.backend == "torchvision": + return T.RandomRotation(self.degrees, interpolation=interpolation) # type: ignore + elif self.backend == "albumentations": + return A.Rotate(limit=self.degrees, interpolation=interpolation, p=1) + else: + raise ValueError(f"Backend {self.backend} not supported") + + +class VideoHorizontalFlip(VideoTransform): + p: float = Field(..., description="The probability of the horizontal flip") + + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable | None: + """Get the horizontal flip transform, only used in train mode. + + Args: + mode (Literal["train", "eval"]): The mode to get the transform for. + + Returns: + Callable | None: If mode is "train", return a horizontal flip transform. If mode is "eval", return None. + """ + if mode == "eval": + return None + if self.backend == "torchvision": + return T.RandomHorizontalFlip(self.p) + elif self.backend == "albumentations": + return A.HorizontalFlip(p=self.p) + else: + raise ValueError(f"Backend {self.backend} not supported") + + +class VideoGrayscale(VideoTransform): + p: float = Field(..., description="The probability of the grayscale transformation") + + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable | None: + """Get the grayscale transform, only used in train mode. + + Args: + mode (Literal["train", "eval"]): The mode to get the transform for. + + Returns: + Callable | None: If mode is "train", return a grayscale transform. If mode is "eval", return None. + """ + if mode == "eval": + return None + if self.backend == "torchvision": + return T.RandomGrayscale(self.p) + elif self.backend == "albumentations": + return A.ToGray(p=self.p) + else: + raise ValueError(f"Backend {self.backend} not supported") + + +class VideoColorJitter(VideoTransform): + brightness: float | tuple[float, float] = Field( + ..., description="The brightness of the color jitter" + ) + contrast: float | tuple[float, float] = Field( + ..., description="The contrast of the color jitter" + ) + saturation: float | tuple[float, float] = Field( + ..., description="The saturation of the color jitter" + ) + hue: float | tuple[float, float] = Field(..., description="The hue of the color jitter") + + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable | None: + """Get the color jitter transform, only used in train mode. + + Args: + mode (Literal["train", "eval"]): The mode to get the transform for. + + Returns: + Callable | None: If mode is "train", return a color jitter transform. If mode is "eval", return None. 
+ """ + if mode == "eval": + return None + if self.backend == "torchvision": + return T.ColorJitter( + brightness=self.brightness, + contrast=self.contrast, + saturation=self.saturation, + hue=self.hue, + ) + elif self.backend == "albumentations": + return A.ColorJitter( + brightness=self.brightness, + contrast=self.contrast, + saturation=self.saturation, + hue=self.hue, + p=1, + ) + else: + raise ValueError(f"Backend {self.backend} not supported") + + +class VideoRandomGrayscale(VideoTransform): + p: float = Field(..., description="The probability of the grayscale transformation") + + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable | None: + """Get the grayscale transform, only used in train mode. + + Args: + mode (Literal["train", "eval"]): The mode to get the transform for. + + Returns: + Callable | None: If mode is "train", return a grayscale transform. If mode is "eval", return None. + """ + if mode == "eval": + return None + if self.backend == "torchvision": + return T.RandomGrayscale(self.p) + elif self.backend == "albumentations": + return A.ToGray(p=self.p) + else: + raise ValueError(f"Backend {self.backend} not supported") + + +class VideoRandomPosterize(VideoTransform): + bits: int = Field(..., description="The number of bits to posterize the image") + p: float = Field(..., description="The probability of the posterize transformation") + + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable | None: + """Get the posterize transform, only used in train mode. + + Args: + mode (Literal["train", "eval"]): The mode to get the transform for. + + Returns: + Callable | None: If mode is "train", return a posterize transform. If mode is "eval", return None. + """ + if mode == "eval": + return None + if self.backend == "torchvision": + return T.RandomPosterize(bits=self.bits, p=self.p) + elif self.backend == "albumentations": + return A.Posterize(num_bits=self.bits, p=self.p) + else: + raise ValueError(f"Backend {self.backend} not supported") + + +class VideoToTensor(VideoTransform): + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable: + """Get the to tensor transform. Same transform for both train and eval. + + Args: + mode (Literal["train", "eval"]): The mode to get the transform for. + + Returns: + Callable: The to tensor transform. + """ + if self.backend == "torchvision": + return self.__class__.to_tensor + else: + raise ValueError(f"Backend {self.backend} not supported") + + def check_input(self, data: dict): + """Check if the input data has the correct shape. + Expected video shape: [T, H, W, C], dtype np.uint8 + """ + for key in self.apply_to: + assert key in data, f"Key {key} not found in data. Available keys: {data.keys()}" + assert data[key].ndim in [ + 4, + 5, + ], f"Video {key} must have 4 or 5 dimensions, got {data[key].ndim}" + assert ( + data[key].dtype == np.uint8 + ), f"Video {key} must have dtype uint8, got {data[key].dtype}" + input_resolution = data[key].shape[-3:-1][::-1] + if key in self.original_resolutions: + expected_resolution = self.original_resolutions[key] + else: + expected_resolution = input_resolution + assert ( + input_resolution == expected_resolution + ), f"Video {key} has invalid resolution {input_resolution}, expected {expected_resolution}. Full shape: {data[key].shape}" + + @staticmethod + def to_tensor(frames: np.ndarray) -> torch.Tensor: + """Convert numpy array to tensor efficiently. 
+ + Args: + frames: numpy array of shape [T, H, W, C] in uint8 format + Returns: + tensor of shape [T, C, H, W] in range [0, 1] + """ + frames_tensor = torch.from_numpy(frames).to(torch.float32) / 255.0 + return frames_tensor.permute(0, 3, 1, 2) # [T, C, H, W] + + +class VideoToNumpy(VideoTransform): + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable: + """Get the to numpy transform. Same transform for both train and eval. + + Args: + mode (Literal["train", "eval"]): The mode to get the transform for. + + Returns: + Callable: The to numpy transform. + """ + if self.backend == "torchvision": + return self.__class__.to_numpy + else: + raise ValueError(f"Backend {self.backend} not supported") + + @staticmethod + def to_numpy(frames: torch.Tensor) -> np.ndarray: + """Convert tensor back to numpy array efficiently. + + Args: + frames: tensor of shape [T, C, H, W] in range [0, 1] + Returns: + numpy array of shape [T, H, W, C] in uint8 format + """ + return (frames.permute(0, 2, 3, 1) * 255).to(torch.uint8).cpu().numpy() + +class VideoToPIL(VideoTransform): + def get_transform(self, mode: Literal["train", "eval"] = "train") -> Callable: + """Get the to PIL transform. Same transform for both train and eval. + + Args: + mode (Literal["train", "eval"]): The mode to get the transform for. + + Returns: + Callable: The to PIL transform. + """ + if self.backend == "torchvision": + return self.__class__.to_pil + else: + raise ValueError(f"Backend {self.backend} not supported") + + @staticmethod + def to_pil(frames: torch.Tensor) -> Image.Image: + """Convert tensor back to PIL Image. + + Args: + frames: tensor of shape [T, C, H, W] in range [0, 1] + Returns: + PIL Image of shape [T, H, W, C] in uint8 format + """ + # video PIL format? + return Image.fromarray((frames.permute(0, 2, 3, 1) * 255).to(torch.uint8).cpu().numpy()) \ No newline at end of file diff --git a/code/dataloader/gr00t_lerobot/video.py b/code/dataloader/gr00t_lerobot/video.py new file mode 100644 index 0000000000000000000000000000000000000000..7f1bf2db18e81f223dc0f489411d42832c008f6e --- /dev/null +++ b/code/dataloader/gr00t_lerobot/video.py @@ -0,0 +1,241 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
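# --- Illustrative aside (not part of the original diff) ---
# The decord path of get_frames_by_timestamps() below maps each requested timestamp
# to the index of the closest decoded frame via a broadcasted argmin over per-frame
# start times. A standalone numpy sketch of that lookup (the 30 fps clip and the
# variable names here are ours, only numpy assumed):
import numpy as np

fps = 30.0
frame_start_ts = np.arange(300)[:, None] / fps   # (num_frames, 1) start time of each frame
requested_ts = np.array([0.0, 0.52, 4.99])       # (num_requests,) query times in seconds

# |frame_start_ts - requested_ts| broadcasts to (num_frames, num_requests);
# argmin over axis 0 picks the nearest frame index for every requested timestamp.
indices = np.abs(frame_start_ts - requested_ts).argmin(axis=0)
print(indices)  # e.g. [  0  16 150]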
+ + +import av +import cv2 +import numpy as np + +import torch # noqa: F401 # isort: skip +import torchvision # noqa: F401 # isort: skip + +# Import decord with graceful fallback +try: + import decord # noqa: F401 + + DECORD_AVAILABLE = True +except ImportError: + DECORD_AVAILABLE = False + +try: + import torchcodec + + TORCHCODEC_AVAILABLE = True +except (ImportError, RuntimeError): + TORCHCODEC_AVAILABLE = False + + +def get_frames_by_indices( + video_path: str, + indices: list[int] | np.ndarray, + video_backend: str = "decord", + video_backend_kwargs: dict = {}, +) -> np.ndarray: + if video_backend == "decord": + if not DECORD_AVAILABLE: + raise ImportError("decord is not available.") + vr = decord.VideoReader(video_path, **video_backend_kwargs) + frames = vr.get_batch(indices) + return frames.asnumpy() + elif video_backend == "torchcodec": + if not TORCHCODEC_AVAILABLE: + raise ImportError("torchcodec is not available.") + decoder = torchcodec.decoders.VideoDecoder( + video_path, device="cpu", dimension_order="NHWC", num_ffmpeg_threads=0 + ) + return decoder.get_frames_at(indices=indices).data.numpy() + elif video_backend == "opencv": + frames = [] + cap = cv2.VideoCapture(video_path, **video_backend_kwargs) + for idx in indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, idx) + ret, frame = cap.read() + if not ret: + raise ValueError(f"Unable to read frame at index {idx}") + frames.append(frame) + cap.release() + frames = np.array(frames) + return frames + else: + raise NotImplementedError + + +def get_frames_by_timestamps( + video_path: str, + timestamps: list[float] | np.ndarray, + video_backend: str = "decord", + video_backend_kwargs: dict = {}, +) -> np.ndarray: + """Get frames from a video at specified timestamps. + Args: + video_path (str): Path to the video file. + timestamps (list[int] | np.ndarray): Timestamps to retrieve frames for, in seconds. + video_backend (str, optional): Video backend to use. Defaults to "decord". + Returns: + np.ndarray: Frames at the specified timestamps. 
+ """ + if video_backend == "decord": + # For some GPUs, AV format data cannot be read + if not DECORD_AVAILABLE: + raise ImportError("decord is not available.") + vr = decord.VideoReader(video_path, **video_backend_kwargs) + num_frames = len(vr) + # Retrieve the timestamps for each frame in the video + frame_ts: np.ndarray = vr.get_frame_timestamp(range(num_frames)) + # Map each requested timestamp to the closest frame index + # Only take the first element of the frame_ts array which corresponds to start_seconds + indices = np.abs(frame_ts[:, :1] - timestamps).argmin(axis=0) + frames = vr.get_batch(indices) + return frames.asnumpy() + elif video_backend == "torchcodec": + if not TORCHCODEC_AVAILABLE: + raise ImportError("torchcodec is not available.") + decoder = torchcodec.decoders.VideoDecoder( + video_path, device="cpu", dimension_order="NHWC", num_ffmpeg_threads=0 + ) + return decoder.get_frames_played_at(seconds=timestamps).data.numpy() + elif video_backend == "opencv": + # Open the video file + cap = cv2.VideoCapture(video_path, **video_backend_kwargs) + if not cap.isOpened(): + raise ValueError(f"Unable to open video file: {video_path}") + # Retrieve the total number of frames + num_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) + # Calculate timestamps for each frame + fps = cap.get(cv2.CAP_PROP_FPS) + frame_ts = np.arange(num_frames) / fps + frame_ts = frame_ts[:, np.newaxis] # Reshape to (num_frames, 1) for broadcasting + # Map each requested timestamp to the closest frame index + indices = np.abs(frame_ts - timestamps).argmin(axis=0) + frames = [] + for idx in indices: + cap.set(cv2.CAP_PROP_POS_FRAMES, idx) + ret, frame = cap.read() + if not ret: + raise ValueError(f"Unable to read frame at index {idx}") + frames.append(frame) + cap.release() + frames = np.array(frames) + return frames + elif video_backend == "torchvision_av": + torchvision.set_video_backend("pyav") + loaded_frames = [] + loaded_ts = [] + + reader = None + try: + reader = torchvision.io.VideoReader(video_path, "video") + + for target_ts in timestamps: + # Reset reader state + reader.seek(target_ts, keyframes_only=True) + + closest_frame = None + closest_ts_diff = float('inf') + + for frame in reader: + current_ts = frame["pts"] + current_diff = abs(current_ts - target_ts) + + if closest_frame is None: + closest_frame = frame + + if current_diff < closest_ts_diff: + # Release the previous frame + if closest_frame is not None: + del closest_frame + closest_ts_diff = current_diff + closest_frame = frame + else: + # The time difference starts to increase, stop searching + break + + if closest_frame is not None: + frame_data = closest_frame["data"] + if isinstance(frame_data, torch.Tensor): + frame_data = frame_data.cpu().numpy() + loaded_frames.append(frame_data) + loaded_ts.append(closest_frame["pts"]) + + # Immediately release frame reference + del closest_frame + + finally: + # Thoroughly clean resources + if reader is not None: + if hasattr(reader, '_c'): + reader._c = None + if hasattr(reader, 'container'): + reader.container.close() + reader.container = None + # Force garbage collection + import gc + gc.collect() + + frames = np.array(loaded_frames) + return frames.transpose(0, 2, 3, 1) + else: + raise NotImplementedError + + +def get_all_frames( + video_path: str, + video_backend: str = "decord", + video_backend_kwargs: dict = {}, + resize_size: tuple[int, int] | None = None, +) -> np.ndarray: + """Get all frames from a video. + Args: + video_path (str): Path to the video file. 
+ video_backend (str, optional): Video backend to use. Defaults to "decord". + video_backend_kwargs (dict, optional): Keyword arguments for the video backend. + resize_size (tuple[int, int], optional): Resize size for the frames. Defaults to None. + """ + if video_backend == "decord": + if not DECORD_AVAILABLE: + raise ImportError("decord is not available.") + vr = decord.VideoReader(video_path, **video_backend_kwargs) + frames = vr.get_batch(range(len(vr))).asnumpy() + elif video_backend == "torchcodec": + if not TORCHCODEC_AVAILABLE: + raise ImportError("torchcodec is not available.") + decoder = torchcodec.decoders.VideoDecoder( + video_path, device="cpu", dimension_order="NHWC", num_ffmpeg_threads=0 + ) + frames = decoder.get_frames_at(indices=range(len(decoder))) + return frames.data.numpy(), frames.pts_seconds.numpy() + elif video_backend == "pyav": + container = av.open(video_path) + frames = [] + for frame in container.decode(video=0): + frame = frame.to_ndarray(format="rgb24") + frames.append(frame) + frames = np.array(frames) + elif video_backend == "torchvision_av": + # set backend and reader + torchvision.set_video_backend("pyav") + reader = torchvision.io.VideoReader(video_path, "video") + frames = [] + for frame in reader: + frames.append(frame["data"].numpy()) + frames = np.array(frames) + frames = frames.transpose(0, 2, 3, 1) + else: + raise NotImplementedError(f"Video backend {video_backend} not implemented") + # resize frames if specified + if resize_size is not None: + frames = [cv2.resize(frame, resize_size) for frame in frames] + frames = np.array(frames) + return frames \ No newline at end of file diff --git a/code/dataloader/lerobot_datasets.py b/code/dataloader/lerobot_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..954d9bdeaaa1d88328d5bc8b5bd4716fce1903e9 --- /dev/null +++ b/code/dataloader/lerobot_datasets.py @@ -0,0 +1,145 @@ +# Copyright 2025 NVIDIA Corp. and affiliates. All rights reserved. +# Modified by [Fangjing Wang/ SUST University] in [2025]. +# Modification: [return raw data and suport multi-dataset mixture]. +# Modified by [Jinhui YE/ HKUST University] in [2025]. +# Modification: [suport topdowm processing, suport param from config]. + +from pathlib import Path +from typing import Sequence +from omegaconf import OmegaConf + +from starVLA.dataloader.gr00t_lerobot.datasets import LeRobotSingleDataset, LeRobotMixtureDataset +from starVLA.dataloader.gr00t_lerobot.mixtures import DATASET_NAMED_MIXTURES +from starVLA.dataloader.gr00t_lerobot.data_config import get_robot_type_config_map +from starVLA.dataloader.gr00t_lerobot.embodiment_tags import ROBOT_TYPE_TO_EMBODIMENT_TAG, EmbodimentTag + +def collate_fn(batch): + return batch + +def make_LeRobotSingleDataset( + data_root_dir: Path | str, + data_name: str, + robot_type: str, + delete_pause_frame: bool = False, + data_cfg: dict | None = None, +) -> LeRobotSingleDataset: + """ + Make a LeRobotSingleDataset object. + + :param data_root_dir: The root directory of the dataset. + :param data_name: The name of the dataset. + :param robot_type: The robot type config to use. + :param crop_obs_camera: Whether to crop the observation camera images. + :return: A LeRobotSingleDataset object. 
+ """ + chunk_size = data_cfg.get("chunk_size") + state_use_action_chunk = data_cfg.get("state_use_action_chunk") + num_history_steps = data_cfg.get("num_history_steps", 0) + data_config = get_robot_type_config_map( + chunk_size=chunk_size, + state_use_action_chunk=state_use_action_chunk, + num_history_steps=num_history_steps, + )[robot_type] + modality_config = data_config.modality_config() + transforms = data_config.transform() + dataset_path = data_root_dir / data_name + if robot_type not in ROBOT_TYPE_TO_EMBODIMENT_TAG: + print(f"Warning: Robot type {robot_type} not found in ROBOT_TYPE_TO_EMBODIMENT_TAG, using {EmbodimentTag.NEW_EMBODIMENT} as default") + embodiment_tag = EmbodimentTag.NEW_EMBODIMENT + else: + embodiment_tag = ROBOT_TYPE_TO_EMBODIMENT_TAG[robot_type] + + video_backend = data_cfg.get("video_backend", "decord") if data_cfg else "decord" + + return LeRobotSingleDataset( + dataset_path=dataset_path, + modality_configs=modality_config, + transforms=transforms, + embodiment_tag=embodiment_tag, + video_backend=video_backend, # decord is more efficiency | torchvision_av for video.av1 + delete_pause_frame=delete_pause_frame, + data_cfg=data_cfg, + ) + +def get_vla_dataset( + data_cfg: dict, + mode: str = "train", + balance_dataset_weights: bool = False, + balance_trajectory_weights: bool = False, + seed: int = 42, + delete_pause_frame: bool = True, + **kwargs: dict, +) -> LeRobotMixtureDataset: + """ + Get a LeRobotMixtureDataset object. + """ + data_root_dir = data_cfg.data_root_dir + data_mix = data_cfg.data_mix + mixture_spec = DATASET_NAMED_MIXTURES[data_mix] + included_datasets, filtered_mixture_spec = set(), [] + for d_name, d_weight, robot_type in mixture_spec: + dataset_key = (d_name, robot_type) + if dataset_key in included_datasets: + print(f"Skipping Duplicate Dataset: `{(d_name, d_weight, robot_type)}`") + continue + + included_datasets.add(dataset_key) + filtered_mixture_spec.append((d_name, d_weight, robot_type)) + + dataset_mixture = [] + for d_name, d_weight, robot_type in filtered_mixture_spec: + dataset_mixture.append((make_LeRobotSingleDataset(Path(data_root_dir), d_name, robot_type, delete_pause_frame=delete_pause_frame, data_cfg=data_cfg), d_weight)) + + return LeRobotMixtureDataset( + dataset_mixture, + mode=mode, + balance_dataset_weights=balance_dataset_weights, + balance_trajectory_weights=balance_trajectory_weights, + seed=seed, + data_cfg=data_cfg, + **kwargs, + ) + + + +if __name__ == "__main__": + + import argparse + parser = argparse.ArgumentParser() + parser.add_argument("--config_yaml", type=str, default="./starVLA/config/training/starvla_cotrain_behavior.yaml", help="Path to YAML config") + args, clipargs = parser.parse_known_args() + + args.config_yaml = "examples/LIBERO/train_files/starvla_cotrain_libero.yaml" + cfg = OmegaConf.load(args.config_yaml) + + vla_dataset_cfg = cfg.datasets.vla_data + # vla_dataset_cfg.data_root_dir = "./playground/Datasets/behavior-1k" + # vla_dataset_cfg.include_state = True + # vla_dataset_cfg.data_mix = "BEHAVIOR_dual_base_depth" + vla_dataset_cfg.task_id = 1 + for task_id in ["all"]: + # 11,26,36,37 + # 5,11,13,26,36,27,43,44,45,46 + # 2,3,5,11,13,25,26,27, + # 3,5,11,13, / 14,15,16,17, / 19,20,23,25, / 26,27,30,34, / 36,37,38,39, 41,42,43,44,45,46,47,49 + vla_dataset_cfg.task_id = task_id + print(f"Testing Task ID: {task_id}") + dataset = get_vla_dataset(data_cfg=vla_dataset_cfg) + # dataset + from torch.utils.data import DataLoader + train_dataloader = DataLoader( + dataset, + batch_size=2, + num_workers=1, # 
For Debug + collate_fn=collate_fn, + ) + + from tqdm import tqdm + count = 1 + for batch in tqdm(train_dataloader, desc="Processing Batches"): + # print(batch) + # print(1) + if count > 1: + break + count += 1 + pass \ No newline at end of file diff --git a/code/dataloader/qwenvl_llavajson/__pycache__/qwen_data_config.cpython-310.pyc b/code/dataloader/qwenvl_llavajson/__pycache__/qwen_data_config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0343b52aeb3caf567010067354f61e471a4089e0 Binary files /dev/null and b/code/dataloader/qwenvl_llavajson/__pycache__/qwen_data_config.cpython-310.pyc differ diff --git a/code/dataloader/qwenvl_llavajson/__pycache__/qwen_data_config.cpython-311.pyc b/code/dataloader/qwenvl_llavajson/__pycache__/qwen_data_config.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c4cab9e5e946a8565b9cf4e0c11e995bbc1c1f6 Binary files /dev/null and b/code/dataloader/qwenvl_llavajson/__pycache__/qwen_data_config.cpython-311.pyc differ diff --git a/code/dataloader/qwenvl_llavajson/__pycache__/rope2d.cpython-310.pyc b/code/dataloader/qwenvl_llavajson/__pycache__/rope2d.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..84b233b414798b0b9d42e91c2797d4e28ef148c1 Binary files /dev/null and b/code/dataloader/qwenvl_llavajson/__pycache__/rope2d.cpython-310.pyc differ diff --git a/code/dataloader/qwenvl_llavajson/__pycache__/rope2d.cpython-311.pyc b/code/dataloader/qwenvl_llavajson/__pycache__/rope2d.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9bb02c6ae52ef35de67ed7b90531aba9a5f16c30 Binary files /dev/null and b/code/dataloader/qwenvl_llavajson/__pycache__/rope2d.cpython-311.pyc differ diff --git a/code/dataloader/qwenvl_llavajson/qwen_data_config.py b/code/dataloader/qwenvl_llavajson/qwen_data_config.py new file mode 100644 index 0000000000000000000000000000000000000000..c690f7ba14e9b6ec5f5bfaab310abaf867505b96 --- /dev/null +++ b/code/dataloader/qwenvl_llavajson/qwen_data_config.py @@ -0,0 +1,44 @@ +import re + +from pathlib import Path + +# You can add multimodal datasets here and register a short nickname to ${data_dict}. 
+# The data format should follow the general multimodal VLM format, for example: +# https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-finetune/README.md + +json_root = f"./playground/Datasets/LLaVA-OneVision-COCO/llava_jsons" +image_root = f"./playground/Datasets/LLaVA-OneVision-COCO/images" + +SHAREGPT4V_COCO = { + "annotation_path": f"{json_root}/sharegpt4v_coco.json", + "data_path": f"{image_root}/", +} + +data_dict = { + "sharegpt4v_coco": SHAREGPT4V_COCO, +} + +def parse_sampling_rate(dataset_name): + match = re.search(r"%(\d+)$", dataset_name) + if match: + return int(match.group(1)) / 100.0 + return 1.0 + +def data_list(dataset_names): + if dataset_names == ["all"]: + dataset_names = list(data_dict.keys()) + config_list = [] + for dataset_name in dataset_names: + sampling_rate = parse_sampling_rate(dataset_name) + dataset_name = re.sub(r"%(\d+)$", "", dataset_name) + if dataset_name in data_dict.keys(): + config = data_dict[dataset_name].copy() + config["sampling_rate"] = sampling_rate + config_list.append(config) + else: + raise ValueError(f"do not find {dataset_name}") + return config_list + +if __name__ == "__main__": + print(data_list) + diff --git a/code/dataloader/qwenvl_llavajson/rope2d.py b/code/dataloader/qwenvl_llavajson/rope2d.py new file mode 100644 index 0000000000000000000000000000000000000000..ab4da85174dcc7da97bc27264858965628773296 --- /dev/null +++ b/code/dataloader/qwenvl_llavajson/rope2d.py @@ -0,0 +1,351 @@ +import os +import copy +import json +import random +import logging +import re +import time +import math +import ast +from dataclasses import dataclass, field +from typing import Dict, Optional, Sequence, List, Tuple +from io import BytesIO +import base64 + +import numpy as np +import torch +from torch.utils.data import Dataset +from PIL import Image +from decord import VideoReader +import transformers + + +def get_rope_index_25( + spatial_merge_size: Optional[int] = 2, + input_ids: Optional[torch.LongTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + second_per_grid_ts: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Calculate the 3D rope index based on image and video's temporal, height and width in LLM. + + Explanation: + Each embedding sequence contains vision embedding and text embedding or just contains text embedding. + + For pure text embedding sequence, the rotary position embedding has no difference with modern LLMs. + Examples: + input_ids: [T T T T T], here T is for text. + temporal position_ids: [0, 1, 2, 3, 4] + height position_ids: [0, 1, 2, 3, 4] + width position_ids: [0, 1, 2, 3, 4] + + For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part + and 1D rotary position embedding for text part. + Examples: + Temporal (Time): 3 patches, representing different segments of the video in time. + Height: 2 patches, dividing each frame vertically. + Width: 2 patches, dividing each frame horizontally. + We also have some important parameters: + fps (Frames Per Second): The video's frame rate, set to 1. This means one frame is processed each second. + tokens_per_second: This is a crucial parameter. It dictates how many "time-steps" or "temporal tokens" are conceptually packed into a one-second interval of the video. In this case, we have 25 tokens per second. So each second of the video will be represented with 25 separate time points. 
It essentially defines the temporal granularity. + temporal_patch_size: The number of frames that compose one temporal patch. Here, it's 2 frames. + interval: The step size for the temporal position IDs, calculated as tokens_per_second * temporal_patch_size / fps. In this case, 25 * 2 / 1 = 50. This means that each temporal patch will be have a difference of 50 in the temporal position IDs. + input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision. + vision temporal position_ids: [0, 0, 0, 0, 50, 50, 50, 50, 100, 100, 100, 100] + vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] + vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + text temporal position_ids: [101, 102, 103, 104, 105] + text height position_ids: [101, 102, 103, 104, 105] + text width position_ids: [101, 102, 103, 104, 105] + Here we calculate the text start position_ids as the max vision position_ids plus 1. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*): + The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + Returns: + position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`) + mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`) + """ + image_token_id = 151655 + video_token_id = 151656 + vision_start_token_id = 151652 + mrope_position_deltas = [] + if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None): + total_input_ids = input_ids + if attention_mask is None: + attention_mask = torch.ones_like(total_input_ids) + position_ids = torch.ones( + 3, + input_ids.shape[0], + input_ids.shape[1], + dtype=input_ids.dtype, + device=input_ids.device, + ) + image_index, video_index = 0, 0 + attention_mask = attention_mask.to(total_input_ids.device) + for i, input_ids in enumerate(total_input_ids): + input_ids = input_ids[attention_mask[i] == 1] + image_nums, video_nums = 0, 0 + vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) + vision_tokens = input_ids[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + input_tokens = input_ids.tolist() + llm_pos_ids_list: list = [] + st = 0 + remain_images, remain_videos = image_nums, video_nums + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + second_per_grid_t = 0 + image_index += 1 + remain_images -= 1 + ed = ed_image + + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + if second_per_grid_ts is not None: + second_per_grid_t = second_per_grid_ts[video_index] + else: + second_per_grid_t = 1.0 + video_index += 1 + remain_videos -= 1 + ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + range_tensor = torch.arange(llm_grid_t).view(-1, 1) + expanded_range = range_tensor.expand(-1, llm_grid_h * llm_grid_w) + + time_tensor = expanded_range * second_per_grid_t * 2 + + time_tensor_long = time_tensor.long() + t_index = time_tensor_long.flatten() + + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device) + mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i])) + mrope_position_deltas = torch.tensor(mrope_position_deltas, 
device=input_ids.device).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + if attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device) + max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0] + mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1] + else: + position_ids = ( + torch.arange(input_ids.shape[1], device=input_ids.device) + .view(1, 1, -1) + .expand(3, input_ids.shape[0], -1) + ) + mrope_position_deltas = torch.zeros( + [input_ids.shape[0], 1], + device=input_ids.device, + dtype=input_ids.dtype, + ) + + return position_ids, mrope_position_deltas + + +def get_rope_index_2( + spatial_merge_size: Optional[int] = 2, + input_ids: Optional[torch.LongTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + second_per_grid_ts: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, +) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Calculate the 3D rope index based on image and video's temporal, height and width in LLM. + + Explanation: + Each embedding sequence contains vision embedding and text embedding or just contains text embedding. + + For pure text embedding sequence, the rotary position embedding has no difference with mordern LLMs. + Examples: + input_ids: [T T T T T], here T is for text. + temporal position_ids: [0, 1, 2, 3, 4] + height position_ids: [0, 1, 2, 3, 4] + width position_ids: [0, 1, 2, 3, 4] + + For vision and text embedding sequence, we calculate 3D rotary position embedding for vision part + and 1D rotary position embeddin for text part. + Examples: + Assume we have a video input with 3 temporal patches, 2 height patches and 2 width patches. + input_ids: [V V V V V V V V V V V V T T T T T], here V is for vision. + vision temporal position_ids: [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2] + vision height position_ids: [0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1] + vision width position_ids: [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1] + text temporal position_ids: [3, 4, 5, 6, 7] + text height position_ids: [3, 4, 5, 6, 7] + text width position_ids: [3, 4, 5, 6, 7] + Here we calculate the text start position_ids as the max vision position_ids plus 1. + + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + Returns: + position_ids (`torch.LongTensor` of shape `(3, batch_size, sequence_length)`) + mrope_position_deltas (`torch.Tensor` of shape `(batch_size)`) + """ + image_token_id = 151655 + video_token_id = 151656 + vision_start_token_id = 151652 + mrope_position_deltas = [] + if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None): + total_input_ids = input_ids + if attention_mask is None: + attention_mask = torch.ones_like(total_input_ids) + position_ids = torch.ones( + 3, + input_ids.shape[0], + input_ids.shape[1], + dtype=input_ids.dtype, + device=input_ids.device, + ) + image_index, video_index = 0, 0 + for i, input_ids in enumerate(total_input_ids): + input_ids = input_ids[attention_mask[i] == 1] + image_nums, video_nums = 0, 0 + vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) + vision_tokens = input_ids[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + input_tokens = input_ids.tolist() + llm_pos_ids_list: list = [] + st = 0 + remain_images, remain_videos = image_nums, video_nums + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_videos -= 1 + ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device) + mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i])) + mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + if attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device) + max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, 
keepdim=True)[0] + mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1] + else: + position_ids = ( + torch.arange(input_ids.shape[1], device=input_ids.device) + .view(1, 1, -1) + .expand(3, input_ids.shape[0], -1) + ) + mrope_position_deltas = torch.zeros( + [input_ids.shape[0], 1], + device=input_ids.device, + dtype=input_ids.dtype, + ) + + return position_ids, mrope_position_deltas diff --git a/code/dataloader/vlm_datasets.py b/code/dataloader/vlm_datasets.py new file mode 100644 index 0000000000000000000000000000000000000000..422aeed3ec5352f8e85e4ab64e19747c24ef8aa8 --- /dev/null +++ b/code/dataloader/vlm_datasets.py @@ -0,0 +1,658 @@ +import os +import copy +import json +import random +import logging +import re +import time +import math +import itertools +import ast +from dataclasses import dataclass +from typing import Dict, Optional, Sequence, List, Tuple +from io import BytesIO +import base64 +from collections.abc import Sequence +from types import SimpleNamespace +import numpy as np +import torch +from torch.utils.data import Dataset +from PIL import Image +from decord import VideoReader +import transformers +from omegaconf import OmegaConf +from starVLA.dataloader.qwenvl_llavajson.qwen_data_config import data_list +from starVLA.dataloader.qwenvl_llavajson.rope2d import get_rope_index_25, get_rope_index_2 + +IGNORE_INDEX = -100 +IMAGE_TOKEN_INDEX = 151655 +VIDEO_TOKEN_INDEX = 151656 +DEFAULT_IMAGE_TOKEN = "\n" +DEFAULT_VIDEO_TOKEN = "