haofuly committed on
Commit 45ac12e · verified · Parent: cf587f4

Add files using upload-large-folder tool

Files changed (50)
  1. capvector-oft/scripts/extern/convert_prismatic_weights_to_hf.py +237 -0
  2. capvector-oft/training_scripts/training.sh +36 -0
  3. capvector-oft/vla-scripts/extern/convert_openvla_weights_to_hf.py +272 -0
  4. capvector-oft/vla-scripts/extern/verify_openvla.py +89 -0
  5. capvector-oft/vla-scripts/finetune.py +1152 -0
  6. capvector-oft/vla-scripts/finetune_regular_loss.py +1790 -0
  7. capvector-oft/vla-scripts/merge_lora_weights_and_save.py +73 -0
  8. capvector-pi05/.dockerignore +3 -0
  9. capvector-pi05/.gitignore +169 -0
  10. capvector-pi05/.gitmodules +6 -0
  11. capvector-pi05/.pre-commit-config.yaml +16 -0
  12. capvector-pi05/.python-version +1 -0
  13. capvector-pi05/LICENSE +201 -0
  14. capvector-pi05/README.md +128 -0
  15. capvector-pi05/capvector/apply_param_diff.py +135 -0
  16. capvector-pi05/capvector/compute_param_diff.py +142 -0
  17. capvector-pi05/docs/docker.md +25 -0
  18. capvector-pi05/docs/norm_stats.md +69 -0
  19. capvector-pi05/docs/remote_inference.md +71 -0
  20. capvector-pi05/examples/aloha_real/Dockerfile +70 -0
  21. capvector-pi05/examples/aloha_real/README.md +126 -0
  22. capvector-pi05/examples/aloha_real/compose.yml +66 -0
  23. capvector-pi05/examples/aloha_real/constants.py +71 -0
  24. capvector-pi05/examples/aloha_real/convert_aloha_data_to_lerobot.py +263 -0
  25. capvector-pi05/examples/aloha_real/env.py +57 -0
  26. capvector-pi05/examples/aloha_real/main.py +51 -0
  27. capvector-pi05/examples/aloha_real/real_env.py +176 -0
  28. capvector-pi05/examples/aloha_real/requirements.in +18 -0
  29. capvector-pi05/examples/aloha_real/requirements.txt +156 -0
  30. capvector-pi05/examples/aloha_real/robot_utils.py +275 -0
  31. capvector-pi05/examples/aloha_real/video_display.py +36 -0
  32. capvector-pi05/examples/aloha_sim/Dockerfile +41 -0
  33. capvector-pi05/examples/aloha_sim/README.md +36 -0
  34. capvector-pi05/examples/aloha_sim/compose.yml +42 -0
  35. capvector-pi05/examples/aloha_sim/env.py +56 -0
  36. capvector-pi05/examples/aloha_sim/main.py +55 -0
  37. capvector-pi05/examples/aloha_sim/requirements.in +8 -0
  38. capvector-pi05/examples/aloha_sim/requirements.txt +132 -0
  39. capvector-pi05/examples/aloha_sim/saver.py +40 -0
  40. capvector-pi05/examples/convert_jax_model_to_pytorch.py +587 -0
  41. capvector-pi05/examples/droid/README.md +84 -0
  42. capvector-pi05/examples/droid/README_train.md +106 -0
  43. capvector-pi05/examples/droid/compute_droid_nonidle_ranges.py +103 -0
  44. capvector-pi05/examples/droid/convert_droid_data_to_lerobot.py +477 -0
  45. capvector-pi05/examples/droid/main.py +246 -0
  46. capvector-pi05/examples/inference.ipynb +137 -0
  47. capvector-pi05/examples/libero/compose.yml +54 -0
  48. capvector-pi05/examples/libero/convert_libero_data_to_lerobot.py +104 -0
  49. capvector-pi05/examples/policy_records.ipynb +134 -0
  50. capvector-pi05/pyproject.toml +142 -0
capvector-oft/scripts/extern/convert_prismatic_weights_to_hf.py ADDED
@@ -0,0 +1,237 @@
+ """
+ convert_prismatic_weights_to_hf.py
+
+ Utility script for converting full Prismatic VLM weights (from this repository, in the default "Prismatic" format) to
+ the HuggingFace "AutoClasses" (e.g., those defined in `prismatic.extern.hf_*`) for "native" use in `transformers`
+ via `trust_remote_code = True`.
+
+ Theoretically, these changes should be fully compatible with directly merging the models into `transformers` down the
+ line, with first-class support.
+ """
+
+ import json
+ import os
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Dict, List, Union
+
+ import draccus
+ import timm
+ import torch
+ import torch.nn as nn
+ from huggingface_hub import hf_hub_download
+ from timm.models.vision_transformer import LayerScale
+ from transformers import AutoTokenizer
+
+ from prismatic.extern.hf.configuration_prismatic import PrismaticConfig
+ from prismatic.extern.hf.modeling_prismatic import PrismaticForConditionalGeneration
+ from prismatic.extern.hf.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
+
+
+ @dataclass
+ class HFConvertConfig:
+     # fmt: off
+     prismatic_model_path_or_id: Union[str, Path] = (  # Path to Pretrained VLM (on disk or HF Hub)
+         "siglip-224px+7b"
+         # "prism-dinosiglip-224px+7b"
+     )
+     output_hf_model_local_path: Path = Path(          # Local path to save the converted HF model
+         "hf-convert/prismatic-siglip-224px-7b"
+     )
+     output_hf_model_hub_path: str = (                 # HF Hub path for the "final" HF model
+         "TRI-ML/prismatic-siglip-224px-7b"            # => huggingface.co/TRI-ML/prismatic-{...}
+     )
+
+     # HF Hub Credentials (required for Gated Models like LLaMa-2)
+     hf_token: Union[str, Path] = Path(".hf_token")    # Environment variable or Path to HF Token
+
+     def __post_init__(self) -> None:
+         self.hf_token = self.hf_token.read_text().strip() if isinstance(self.hf_token, Path) else self.hf_token
+
+     # fmt: on
+
+
+ # HF Transformers overwrites parameters with names containing `gamma`; we're going to patch VisionBackbone.LayerScale.
+ #   =>> TIMM :: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L109
+ #   =>> Transformers :: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L3960
+ def _ls_new_forward(self, x: torch.Tensor) -> torch.Tensor:
+     return x.mul_(self.scale_factor) if self.inplace else x * self.scale_factor
+
+
+ def ls_apply_patch(ls_module: LayerScale):
+     ls_module.scale_factor = nn.Parameter(ls_module.gamma.clone())
+     ls_module.forward = _ls_new_forward.__get__(ls_module, LayerScale)
+     del ls_module.gamma
+
+
+ # === Conversion Constants ===
+ PROJECTOR_KEY_MAPPING = {
+     "projector.0.weight": "projector.fc1.weight",
+     "projector.0.bias": "projector.fc1.bias",
+     "projector.2.weight": "projector.fc2.weight",
+     "projector.2.bias": "projector.fc2.bias",
+     "projector.4.weight": "projector.fc3.weight",
+     "projector.4.bias": "projector.fc3.bias",
+ }
+
+
+ def remap_state_dicts_for_hf(
+     projector_state_dict: Dict[str, torch.Tensor],
+     llm_backbone_state_dict: Dict[str, torch.Tensor],
+     vision_backbone_state_dicts: List[Dict[str, torch.Tensor]],
+ ) -> Dict[str, torch.Tensor]:
+     """Iterate through Prismatic component state dictionaries and unify / fix key mapping for HF conversion."""
+     hf_state_dict = {}
+
+     # Iterate through Projector =>> use `PROJECTOR_KEY_MAPPING`
+     for key, value in projector_state_dict.items():
+         hf_state_dict[PROJECTOR_KEY_MAPPING[key]] = value
+
+     # Iterate through LLM Backbone =>> replace `llm.` with `language_model.`
+     for key, value in llm_backbone_state_dict.items():
+         hf_state_dict[key.replace("llm.", "language_model.")] = value
+
+     # Iterate through Vision Backbone =>> add "vision_backbone." prefix
+     assert len(vision_backbone_state_dicts) <= 2, "Prismatic models only support up to 2 (fused) vision backbones!"
+     for idx, vision_backbone_state_dict in enumerate(vision_backbone_state_dicts):
+         prefix = "vision_backbone.featurizer" if idx == 0 else "vision_backbone.fused_featurizer"
+         for key, value in vision_backbone_state_dict.items():
+             hf_state_dict[f"{prefix}.{key}"] = value
+
+     return hf_state_dict
+
+
+ @draccus.wrap()
+ def convert_prismatic_weights_to_hf(cfg: HFConvertConfig) -> None:
+     print(f"[*] Converting Prismatic Model `{cfg.prismatic_model_path_or_id}` to HF Transformers Format")
+     torch.set_default_dtype(torch.bfloat16)
+
+     # Get `config.json` and `checkpoint_pt` -- mirrors logic in `prismatic.models.load.py`
+     if os.path.isdir(cfg.prismatic_model_path_or_id):
+         print(f"[*] Loading from Local Path `{(run_dir := Path(cfg.prismatic_model_path_or_id))}`")
+         config_json, checkpoint_pt = run_dir / "config.json", run_dir / "checkpoints" / "latest-checkpoint.pt"
+
+         assert config_json.exists(), f"Missing `config.json` for `{run_dir = }`"
+         assert checkpoint_pt.exists(), f"Missing checkpoint for `{run_dir = }`"
+     else:
+         print(f"[*] Downloading Prismatic Checkpoint from HF Hub :: `TRI-ML/{cfg.prismatic_model_path_or_id}`")
+         config_json = hf_hub_download("TRI-ML/prismatic-vlms", f"{cfg.prismatic_model_path_or_id}/config.json")
+         checkpoint_pt = hf_hub_download(
+             "TRI-ML/prismatic-vlms", f"{cfg.prismatic_model_path_or_id}/checkpoints/latest-checkpoint.pt"
+         )
+
+     # Load "Native" Config JSON =>> Create LLM Config & Instantiate Tokenizer
+     with open(config_json, "r") as f:
+         prismatic_config = json.load(f)["model"]
+
+     # Create HF PrismaticConfig (`transformers.PretrainedConfig`)
+     hf_config = PrismaticConfig(
+         vision_backbone_id=prismatic_config["vision_backbone_id"],
+         llm_backbone_id=prismatic_config["llm_backbone_id"],
+         arch_specifier=prismatic_config["arch_specifier"],
+         image_resize_strategy=prismatic_config["image_resize_strategy"],
+         llm_max_length=prismatic_config["llm_max_length"],
+         torch_dtype=torch.bfloat16,
+     )
+
+     # Instantiate & Add Pad to Tokenizer =>> following `prismatic.models.materialize.get_llm_backbone_and_tokenizer`
+     #   TODO (siddk) :: Implement batched generation -- in which case this should set `padding_side = "left"`!
+     print("[*] Instantiating and Patching Tokenizer, LLM Config")
+     tokenizer = AutoTokenizer.from_pretrained(
+         hf_config.hf_llm_id, model_max_length=hf_config.llm_max_length, token=cfg.hf_token, padding_side="right"
+     )
+     tokenizer.add_special_tokens({"pad_token": "<PAD>"})
+     tokenizer.init_kwargs.pop("add_prefix_space", None)  # Pop to prevent unnecessary warning on reload...
+     assert tokenizer.pad_token_id == hf_config.pad_token_id, "Incorrect Pad Token ID!"
+     assert len(tokenizer) > hf_config.text_config.vocab_size, "Tokenizer vocabulary must be larger than LLM vocabulary!"
+
+     # Patch LLM Config in `hf_config` with vocab_size (+ `hf_config.pad_to_multiple_of`), pad_token_id + validate
+     hf_config.text_config.vocab_size += hf_config.pad_to_multiple_of
+     hf_config.text_config.pad_token_id = hf_config.pad_token_id
+     hf_config.text_config.torch_dtype = torch.bfloat16
+     assert hf_config.text_config.use_cache, "LLM config `use_cache` should be True for inference (set default)!"
+
+     # Create Vision Backbone & Transform =>> following `prismatic.models.materialize.get_vision_backbone_and_transform`
+     #   =>> Deviates a bit from existing code; as such, explicitly tested in `tests/test_image_transforms.py`
+     print("[*] Loading TIMM Vision Backbone(s) and Image Transform(s) =>> Initializing PrismaticImageProcessor")
+     timm_vision_backbones, input_sizes, interpolations, means, stds = [], [], [], [], []
+     for idx, timm_model_id in enumerate(hf_config.timm_model_ids):
+         timm_vision_backbone = timm.create_model(
+             timm_model_id,
+             pretrained=True,
+             num_classes=0,
+             img_size=hf_config.image_sizes[idx],
+             act_layer=hf_config.timm_override_act_layers[idx],
+         )
+         timm_vision_backbones.append(timm_vision_backbone)
+
+         # Get Per-Backbone Image Processing
+         data_cfg = timm.data.resolve_model_data_config(timm_vision_backbone)
+         input_sizes.append((3, hf_config.image_sizes[idx], hf_config.image_sizes[idx]))
+         interpolations.append(data_cfg["interpolation"])
+         means.append(data_cfg["mean"])
+         stds.append(data_cfg["std"])
+
+         # Patch `LayerScale` because of HF annoying `fix_key` overwrite...
+         for module in timm_vision_backbone.modules():
+             if isinstance(module, LayerScale):
+                 ls_apply_patch(module)
+
+     # Create PrismaticImageProcessor (`transformers.ImageProcessingMixin`)
+     hf_image_processor = PrismaticImageProcessor(
+         use_fused_vision_backbone=hf_config.use_fused_vision_backbone,
+         image_resize_strategy=hf_config.image_resize_strategy,
+         input_sizes=input_sizes,
+         interpolations=interpolations,
+         means=means,
+         stds=stds,
+     )
+
+     # Create top-level PrismaticProcessor (`transformers.ProcessorMixin` =>> enables registry w/ AutoProcessor)
+     print("[*] Creating PrismaticProcessor Instance from Tokenizer and PrismaticImageProcessor")
+     hf_processor = PrismaticProcessor(image_processor=hf_image_processor, tokenizer=tokenizer)
+
+     # Load Prismatic Model State Dictionary (in preparation for conversion)
+     print("[*] Loading Prismatic VLM State Dictionary from Checkpoint")
+     model_state_dict = torch.load(checkpoint_pt, map_location="cpu")["model"]
+     assert ("downsampler" not in model_state_dict) or (len(model_state_dict["downsampler"]) == 0), "Downsampler?"
+     assert ("projector" in model_state_dict) and ("llm_backbone" in model_state_dict), "Missing keys!"
+
+     # Convert
+     print("[*] Running Conversion")
+     converted_state_dict = remap_state_dicts_for_hf(
+         model_state_dict["projector"],
+         model_state_dict["llm_backbone"],
+         vision_backbone_state_dicts=[vb.state_dict() for vb in timm_vision_backbones],
+     )
+
+     # Create PrismaticForConditionalGeneration =>> Note that we can't initialize on `meta` device because TIMM
+     print("[*] Building (Randomly Initialized) Model =>> PrismaticForConditionalGeneration")
+     hf_model = PrismaticForConditionalGeneration(hf_config)
+     hf_model.load_state_dict(converted_state_dict, strict=True, assign=True)
+
+     # Cast Model to BF16 before Saving
+     hf_model.to(torch.bfloat16)
+
+     # Save Pretrained Versions to Local Path
+     print("[*] Saving Model & Processor to Local Path")
+     hf_model.save_pretrained(cfg.output_hf_model_local_path, max_shard_size="7GB")
+     hf_image_processor.save_pretrained(cfg.output_hf_model_local_path)
+     hf_processor.save_pretrained(cfg.output_hf_model_local_path)
+
+     # Register AutoClasses
+     PrismaticConfig.register_for_auto_class()
+     PrismaticImageProcessor.register_for_auto_class("AutoImageProcessor")
+     PrismaticProcessor.register_for_auto_class("AutoProcessor")
+     PrismaticForConditionalGeneration.register_for_auto_class("AutoModelForVision2Seq")
+
+     # Push to Hub
+     print("[*] Pushing Model & Processor to HF Hub")
+     hf_config.push_to_hub(cfg.output_hf_model_hub_path)
+     hf_model.push_to_hub(cfg.output_hf_model_hub_path, max_shard_size="7GB")
+     hf_image_processor.push_to_hub(cfg.output_hf_model_hub_path)
+     hf_processor.push_to_hub(cfg.output_hf_model_hub_path)
+
+
+ if __name__ == "__main__":
+     convert_prismatic_weights_to_hf()
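Once the AutoClasses are registered and the checkpoint is saved as above, the converted model is meant to load through the standard `transformers` entry points. A minimal sketch, assuming the default `output_hf_model_local_path` from this script (illustrative only, not part of the commit):

    import torch
    from transformers import AutoModelForVision2Seq, AutoProcessor

    local_path = "hf-convert/prismatic-siglip-224px-7b"  # default output path above
    processor = AutoProcessor.from_pretrained(local_path, trust_remote_code=True)
    vlm = AutoModelForVision2Seq.from_pretrained(local_path, torch_dtype=torch.bfloat16, trust_remote_code=True)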
capvector-oft/training_scripts/training.sh ADDED
@@ -0,0 +1,36 @@
+ VERSION="v0"
+ TASK="10"  # spatial / object / goal / 10 / 90
+ VLA_PATH="checkpoints/initialized_pt_vla/initailized_openvla_with_SF_spatial_v0.4.2"
+ DATA_ROOT_DIR="data/libero_openvla"
+ RUN_ROOT_DIR="experiments/training_results"
+ REGULARIZATION_LORA_VECTOR_PATH="checkpoints/lora_diff/sf_150000_steps_spatial_adapter_diff.safetensors"
+ WANDB_ENTITY="YOUR_WANDB_ENTITY"
+ WANDB_PROJECT="YOUR_WANDB_PROJECT"
+ EVAL_LOG_PATH="experiments/eval_logs/${VERSION}_output.log"
+
+ torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune_regular_loss.py \
+     --vla_path "$VLA_PATH" \
+     --data_root_dir "$DATA_ROOT_DIR" \
+     --dataset_name libero_${TASK}_no_noops \
+     --run_root_dir "$RUN_ROOT_DIR" \
+     --use_l1_regression True \
+     --use_diffusion False \
+     --use_film False \
+     --num_images_in_input 2 \
+     --use_proprio True \
+     --batch_size 8 \
+     --learning_rate 5e-4 \
+     --scheduler CosineAnnealingLR \
+     --max_steps 150100 \
+     --save_freq 150000 \
+     --save_latest_checkpoint_only True \
+     --merge_lora_during_training True \
+     --regularization_lora_vector_path "$REGULARIZATION_LORA_VECTOR_PATH" \
+     --regularization_weight 1e-4 \
+     --image_aug True \
+     --lora_rank 32 \
+     --wandb_entity "$WANDB_ENTITY" \
+     --wandb_project "$WANDB_PROJECT" \
+     --run_id_override "$VERSION"
+
+ python experiments/robot/libero/run_libero_eval.py --pretrained_checkpoint "$RUN_ROOT_DIR/$VERSION" --task_suite_name libero_${TASK} > "$EVAL_LOG_PATH" 2>&1
capvector-oft/vla-scripts/extern/convert_openvla_weights_to_hf.py ADDED
@@ -0,0 +1,272 @@
+ """
+ convert_openvla_weights_to_hf.py
+
+ Utility script for converting full OpenVLA VLA weights (from this repository, in the default "Prismatic" format) to
+ the HuggingFace "AutoClasses" (e.g., those defined in `prismatic.extern.hf_*`) for "native" use in `transformers`
+ via `trust_remote_code = True`.
+
+ Theoretically, these changes should be fully compatible with directly merging the models into `transformers` down the
+ line, with first-class support.
+
+ Usage:
+     python vla-scripts/extern/convert_openvla_weights_to_hf.py \
+         --openvla_model_path_or_id <PATH TO PRISMATIC TRAINING RUN DIR> \
+         --output_hf_model_local_path <OUTPUT DIR FOR CONVERTED CHECKPOINT>
+ """
+
+ import json
+ import os
+ import shutil
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Dict, Union
+
+ import draccus
+ import timm
+ import torch
+ import torch.nn as nn
+ from huggingface_hub import hf_hub_download
+ from timm.models.vision_transformer import LayerScale
+ from transformers import AutoTokenizer
+
+ from prismatic.conf import ModelConfig
+ from prismatic.extern.hf.configuration_prismatic import OpenVLAConfig
+ from prismatic.extern.hf.modeling_prismatic import OpenVLAForActionPrediction
+ from prismatic.extern.hf.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
+
+
+ @dataclass
+ class HFConvertConfig:
+     # fmt: off
+     openvla_model_path_or_id: Union[str, Path] = (         # Path to Pretrained VLA (on disk or HF Hub)
+         "runs/prism-dinosiglip-224px+mx-oxe-magic-soup-plus+n8+b32+x7"
+     )
+     output_hf_model_local_path: Path = Path(               # Local path to save the converted HF model
+         "hf-convert/openvla-7b"
+     )
+     output_hf_model_hub_path: str = "openvla/openvla-7b"   # (Optional) Path to HF Hub Path to push
+                                                            #   model to
+
+     # HF Hub Credentials (required for Gated Models like LLaMa-2)
+     hf_token: Union[str, Path] = Path(".hf_token")         # Environment variable or Path to HF Token
+
+     def __post_init__(self) -> None:
+         self.hf_token = self.hf_token.read_text().strip() if isinstance(self.hf_token, Path) else self.hf_token
+
+     # fmt: on
+
+
+ # HF Transformers overwrites parameters with names containing `gamma`; we're going to patch VisionBackbone.LayerScale.
+ #   =>> TIMM :: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L109
+ #   =>> Transformers :: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L3960
+ def _ls_new_forward(self, x: torch.Tensor) -> torch.Tensor:
+     return x.mul_(self.scale_factor) if self.inplace else x * self.scale_factor
+
+
+ def ls_apply_patch(ls_module: LayerScale):
+     ls_module.scale_factor = nn.Parameter(ls_module.gamma.clone())
+     ls_module.forward = _ls_new_forward.__get__(ls_module, LayerScale)
+     del ls_module.gamma
+
+
+ # === Conversion Constants ===
+ PROJECTOR_KEY_MAPPING = {
+     "projector.0.weight": "projector.fc1.weight",
+     "projector.0.bias": "projector.fc1.bias",
+     "projector.2.weight": "projector.fc2.weight",
+     "projector.2.bias": "projector.fc2.bias",
+     "projector.4.weight": "projector.fc3.weight",
+     "projector.4.bias": "projector.fc3.bias",
+ }
+
+
+ def remap_state_dicts_for_hf(
+     prismatic_vision_backbone_state_dict: Dict[str, torch.Tensor],
+     projector_state_dict: Dict[str, torch.Tensor],
+     llm_backbone_state_dict: Dict[str, torch.Tensor],
+     use_fused_vision_backbone: bool = False,
+ ) -> Dict[str, torch.Tensor]:
+     """Iterate through Prismatic component state dictionaries and unify / fix key mapping for HF conversion."""
+     hf_state_dict = {}
+
+     # Iterate through Projector =>> use `PROJECTOR_KEY_MAPPING`
+     for key, value in projector_state_dict.items():
+         hf_state_dict[PROJECTOR_KEY_MAPPING[key]] = value
+
+     # Iterate through LLM Backbone =>> replace `llm.` with `language_model.`
+     for key, value in llm_backbone_state_dict.items():
+         hf_state_dict[key.replace("llm.", "language_model.")] = value
+
+     # Iterate through Vision Backbone =>> add "vision_backbone." prefix
+     if not use_fused_vision_backbone:
+         for key, value in prismatic_vision_backbone_state_dict.items():
+             hf_state_dict[key.replace("featurizer.", "vision_backbone.featurizer.")] = value
+     else:
+         # Note =>> Assumes that backbones are always DINO + SigLIP...
+         for key, value in prismatic_vision_backbone_state_dict.items():
+             if key.startswith("dino_featurizer"):
+                 if key.endswith(".gamma"):
+                     # Handle `LayerScale gamma` =>> DINOv2 only!
+                     key = key.replace(".gamma", ".scale_factor")
+                 hf_state_dict[key.replace("dino_featurizer.", "vision_backbone.featurizer.")] = value
+             elif key.startswith("siglip_featurizer"):
+                 hf_state_dict[key.replace("siglip_featurizer.", "vision_backbone.fused_featurizer.")] = value
+
+     return hf_state_dict
+
+
+ @draccus.wrap()
+ def convert_openvla_weights_to_hf(cfg: HFConvertConfig) -> None:
+     print(f"[*] Converting OpenVLA Model `{cfg.openvla_model_path_or_id}` to HF Transformers Format")
+     torch.set_default_dtype(torch.bfloat16)
+
+     # Get `config.json`, `dataset_statistics.json` and `checkpoint_pt` -- mirrors logic in `prismatic.models.load.py`
+     if os.path.isdir(cfg.openvla_model_path_or_id):
+         print(f"[*] Loading from Local Path `{(run_dir := Path(cfg.openvla_model_path_or_id))}`")
+         config_json, checkpoint_pt = run_dir / "config.json", run_dir / "checkpoints" / "latest-checkpoint.pt"
+         dataset_statistics_json = run_dir / "dataset_statistics.json"
+
+         assert config_json.exists(), f"Missing `config.json` for `{run_dir = }`"
+         assert checkpoint_pt.exists(), f"Missing checkpoint for `{run_dir = }`"
+         assert dataset_statistics_json.exists(), f"Missing `dataset_statistics.json` for `{run_dir = }`"
+     else:
+         print(f"[*] Downloading OpenVLA Checkpoint from HF Hub :: `openvla/openvla-dev/{cfg.openvla_model_path_or_id}`")
+         config_json = hf_hub_download("openvla/openvla-dev", f"{cfg.openvla_model_path_or_id}/config.json")
+         checkpoint_pt = hf_hub_download(
+             "openvla/openvla-dev", f"{cfg.openvla_model_path_or_id}/checkpoints/latest-checkpoint.pt"
+         )
+         dataset_statistics_json = hf_hub_download(
+             "openvla/openvla-dev", f"{cfg.openvla_model_path_or_id}/dataset_statistics.json"
+         )
+
+     # Load "Native" Config JSON =>> Create LLM Config & Instantiate Tokenizer
+     with open(config_json, "r") as f:
+         vla_cfg = json.load(f)["vla"]
+         prismatic_config = ModelConfig.get_choice_class(vla_cfg["base_vlm"])().__dict__
+
+     # Load Normalization Statistics
+     with open(dataset_statistics_json, "r") as f:
+         norm_stats = json.load(f)
+
+     # Create HF OpenVLAConfig (`transformers.PretrainedConfig`)
+     hf_config = OpenVLAConfig(
+         vision_backbone_id=prismatic_config["vision_backbone_id"],
+         llm_backbone_id=prismatic_config["llm_backbone_id"],
+         arch_specifier=prismatic_config["arch_specifier"],
+         image_resize_strategy=prismatic_config["image_resize_strategy"],
+         llm_max_length=prismatic_config["llm_max_length"],
+         torch_dtype=torch.bfloat16,
+         norm_stats=norm_stats,
+     )
+
+     # Instantiate & Add Pad to Tokenizer =>> following `prismatic.models.materialize.get_llm_backbone_and_tokenizer`
+     #   TODO (siddk) :: Implement batched generation -- in which case this should set `padding_side = "left"`!
+     print("[*] Instantiating and Patching Tokenizer, LLM Config")
+     tokenizer = AutoTokenizer.from_pretrained(
+         hf_config.hf_llm_id, model_max_length=hf_config.llm_max_length, token=cfg.hf_token, padding_side="right"
+     )
+     tokenizer.add_special_tokens({"pad_token": "<PAD>"})
+     tokenizer.init_kwargs.pop("add_prefix_space", None)  # Pop to prevent unnecessary warning on reload...
+     assert tokenizer.pad_token_id == hf_config.pad_token_id, "Incorrect Pad Token ID!"
+     assert len(tokenizer) > hf_config.text_config.vocab_size, "Tokenizer vocabulary must be larger than LLM vocabulary!"
+
+     # Patch LLM Config in `hf_config` with vocab_size (+ `hf_config.pad_to_multiple_of`), pad_token_id + validate
+     hf_config.text_config.vocab_size += hf_config.pad_to_multiple_of
+     hf_config.text_config.pad_token_id = hf_config.pad_token_id
+     hf_config.text_config.torch_dtype = torch.bfloat16
+     assert hf_config.text_config.use_cache, "LLM config `use_cache` should be True for inference (set default)!"
+
+     # Create Vision Backbone & Transform =>> following `prismatic.models.materialize.get_vision_backbone_and_transform`
+     #   =>> Deviates a bit from existing code; as such, explicitly tested in `tests/test_image_transforms.py`
+     print("[*] Loading TIMM Vision Backbone(s) and Image Transform(s) =>> Initializing PrismaticImageProcessor")
+     input_sizes, interpolations, means, stds = [], [], [], []
+     for idx, timm_model_id in enumerate(hf_config.timm_model_ids):
+         timm_vision_backbone = timm.create_model(
+             timm_model_id,
+             pretrained=True,
+             num_classes=0,
+             img_size=hf_config.image_sizes[idx],
+             act_layer=hf_config.timm_override_act_layers[idx],
+         )
+
+         # Get Per-Backbone Image Processing
+         data_cfg = timm.data.resolve_model_data_config(timm_vision_backbone)
+         input_sizes.append((3, hf_config.image_sizes[idx], hf_config.image_sizes[idx]))
+         interpolations.append(data_cfg["interpolation"])
+         means.append(data_cfg["mean"])
+         stds.append(data_cfg["std"])
+
+         # Patch `LayerScale` because of HF annoying `fix_key` overwrite...
+         for module in timm_vision_backbone.modules():
+             if isinstance(module, LayerScale):
+                 ls_apply_patch(module)
+
+     # Create PrismaticImageProcessor (`transformers.ImageProcessingMixin`)
+     hf_image_processor = PrismaticImageProcessor(
+         use_fused_vision_backbone=hf_config.use_fused_vision_backbone,
+         image_resize_strategy=hf_config.image_resize_strategy,
+         input_sizes=input_sizes,
+         interpolations=interpolations,
+         means=means,
+         stds=stds,
+     )
+
+     # Create top-level PrismaticProcessor (`transformers.ProcessorMixin` =>> enables registry w/ AutoProcessor)
+     print("[*] Creating PrismaticProcessor Instance from Tokenizer and PrismaticImageProcessor")
+     hf_processor = PrismaticProcessor(image_processor=hf_image_processor, tokenizer=tokenizer)
+
+     # Load Prismatic Model State Dictionary (in preparation for conversion)
+     print("[*] Loading Prismatic VLM State Dictionary from Checkpoint")
+     model_state_dict = torch.load(checkpoint_pt, map_location="cpu")["model"]
+     assert ("downsampler" not in model_state_dict) or (len(model_state_dict["downsampler"]) == 0), "Downsampler?"
+     assert all([k in model_state_dict for k in ["vision_backbone", "projector", "llm_backbone"]]), "Missing keys!"
+
+     # Convert
+     print("[*] Running Conversion")
+     converted_state_dict = remap_state_dicts_for_hf(
+         model_state_dict["vision_backbone"],
+         model_state_dict["projector"],
+         model_state_dict["llm_backbone"],
+         use_fused_vision_backbone=hf_config.use_fused_vision_backbone,
+     )
+
+     # Create OpenVLAForActionPrediction =>> Note that we can't initialize on `meta` device because TIMM
+     print("[*] Building (Randomly Initialized) Model =>> OpenVLAForActionPrediction")
+     hf_model = OpenVLAForActionPrediction(hf_config)
+     hf_model.load_state_dict(converted_state_dict, strict=True, assign=True)
+
+     # Cast Model to BF16 before Saving
+     hf_model.to(torch.bfloat16)
+
+     # Save Pretrained Versions to Local Path
+     print("[*] Saving Model & Processor to Local Path")
+     hf_model.save_pretrained(cfg.output_hf_model_local_path, max_shard_size="7GB")
+     hf_image_processor.save_pretrained(cfg.output_hf_model_local_path)
+     hf_processor.save_pretrained(cfg.output_hf_model_local_path)
+
+     # Copy `dataset_statistics.json` File to Converted Checkpoint Directory
+     output_dataset_statistics_json = cfg.output_hf_model_local_path / "dataset_statistics.json"
+     shutil.copyfile(dataset_statistics_json, output_dataset_statistics_json)
+
+     print(f"[*] Saving Complete! Saved converted checkpoint to: {cfg.output_hf_model_local_path}")
+
+     #####################################################################################
+     # Optional: Push Model to Hugging Face Hub
+     #####################################################################################
+
+     # # Register AutoClasses
+     # OpenVLAConfig.register_for_auto_class()
+     # PrismaticImageProcessor.register_for_auto_class("AutoImageProcessor")
+     # PrismaticProcessor.register_for_auto_class("AutoProcessor")
+     # OpenVLAForActionPrediction.register_for_auto_class("AutoModelForVision2Seq")
+
+     # # Push to HF Hub
+     # print("[*] Pushing Model & Processor to HF Hub")
+     # hf_config.push_to_hub(cfg.output_hf_model_hub_path)
+     # hf_model.push_to_hub(cfg.output_hf_model_hub_path, max_shard_size="7GB")
+     # hf_image_processor.push_to_hub(cfg.output_hf_model_hub_path)
+     # hf_processor.push_to_hub(cfg.output_hf_model_hub_path)
+
+
+ if __name__ == "__main__":
+     convert_openvla_weights_to_hf()
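For the fused DINO + SigLIP case handled in `remap_state_dicts_for_hf` above, the remapping amounts to renames of the following shape (illustrative keys only; the actual block indices depend on the TIMM checkpoints, and the `.gamma` to `.scale_factor` rename applies only to the DINOv2 LayerScale parameters):

    # dino_featurizer.blocks.0.ls1.gamma          -> vision_backbone.featurizer.blocks.0.ls1.scale_factor
    # siglip_featurizer.blocks.0.attn.qkv.weight  -> vision_backbone.fused_featurizer.blocks.0.attn.qkv.weight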
capvector-oft/vla-scripts/extern/verify_openvla.py ADDED
@@ -0,0 +1,89 @@
+ """
+ verify_openvla.py
+
+ Given an HF-exported OpenVLA model, attempt to load via AutoClasses, and verify forward() and predict_action().
+ """
+
+ import time
+
+ import numpy as np
+ import torch
+ from PIL import Image
+ from transformers import AutoModelForVision2Seq, AutoProcessor
+
+ # === Verification Arguments
+ MODEL_PATH = "openvla/openvla-7b"
+ SYSTEM_PROMPT = (
+     "A chat between a curious user and an artificial intelligence assistant. "
+     "The assistant gives helpful, detailed, and polite answers to the user's questions."
+ )
+ INSTRUCTION = "put spoon on towel"
+
+
+ def get_openvla_prompt(instruction: str) -> str:
+     if "v01" in MODEL_PATH:
+         return f"{SYSTEM_PROMPT} USER: What action should the robot take to {instruction.lower()}? ASSISTANT:"
+     else:
+         return f"In: What action should the robot take to {instruction.lower()}?\nOut:"
+
+
+ @torch.inference_mode()
+ def verify_openvla() -> None:
+     print(f"[*] Verifying OpenVLAForActionPrediction using Model `{MODEL_PATH}`")
+     device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+     # Load Processor & VLA
+     print("[*] Instantiating Processor and Pretrained OpenVLA")
+     processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
+
+     # === BFLOAT16 + FLASH-ATTN MODE ===
+     print("[*] Loading in BF16 with Flash-Attention Enabled")
+     vla = AutoModelForVision2Seq.from_pretrained(
+         MODEL_PATH,
+         attn_implementation="flash_attention_2",
+         torch_dtype=torch.bfloat16,
+         low_cpu_mem_usage=True,
+         trust_remote_code=True,
+     ).to(device)
+
+     # === 8-BIT QUANTIZATION MODE (`pip install bitsandbytes`) :: [~9GB of VRAM Passive || 10GB of VRAM Active] ===
+     # print("[*] Loading in 8-Bit Quantization Mode")
+     # vla = AutoModelForVision2Seq.from_pretrained(
+     #     MODEL_PATH,
+     #     attn_implementation="flash_attention_2",
+     #     torch_dtype=torch.float16,
+     #     quantization_config=BitsAndBytesConfig(load_in_8bit=True),
+     #     low_cpu_mem_usage=True,
+     #     trust_remote_code=True,
+     # )
+
+     # === 4-BIT QUANTIZATION MODE (`pip install bitsandbytes`) :: [~6GB of VRAM Passive || 7GB of VRAM Active] ===
+     # print("[*] Loading in 4-Bit Quantization Mode")
+     # vla = AutoModelForVision2Seq.from_pretrained(
+     #     MODEL_PATH,
+     #     attn_implementation="flash_attention_2",
+     #     torch_dtype=torch.float16,
+     #     quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+     #     low_cpu_mem_usage=True,
+     #     trust_remote_code=True,
+     # )
+
+     print("[*] Iterating with Randomly Generated Images")
+     for _ in range(100):
+         prompt = get_openvla_prompt(INSTRUCTION)
+         image = Image.fromarray(np.asarray(np.random.rand(256, 256, 3) * 255, dtype=np.uint8))
+
+         # === BFLOAT16 MODE ===
+         inputs = processor(prompt, image).to(device, dtype=torch.bfloat16)
+
+         # === 8-BIT/4-BIT QUANTIZATION MODE ===
+         # inputs = processor(prompt, image).to(device, dtype=torch.float16)
+
+         # Run OpenVLA Inference
+         start_time = time.time()
+         action = vla.predict_action(**inputs, unnorm_key="bridge_orig", do_sample=False)
+         print(f"\t=>> Time: {time.time() - start_time:.4f} || Action: {action}")
+
+
+ if __name__ == "__main__":
+     verify_openvla()
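Note that the commented-out 8-bit and 4-bit modes above reference `BitsAndBytesConfig`, which is not imported in the file as committed; enabling them would also require the corresponding import (shown here as a sketch, not part of the commit):

    from transformers import BitsAndBytesConfig  # needed only for the quantization modes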
capvector-oft/vla-scripts/finetune.py ADDED
@@ -0,0 +1,1152 @@
+ """
+ finetune.py
+
+ Fine-tunes OpenVLA via LoRA.
+ """
+
+ import os
+ import time
+ from collections import deque
+ from dataclasses import dataclass
+ from pathlib import Path
+ from typing import Dict, Optional, Tuple, Type
+
+ import draccus
+ import torch
+ import torch.distributed as dist
+ import torch.nn as nn
+ import tqdm
+ from accelerate import PartialState
+ from huggingface_hub import HfApi, snapshot_download
+ from peft import LoraConfig, PeftModel, get_peft_model
+ from torch.nn.parallel import DistributedDataParallel as DDP
+ from torch.optim import AdamW
+ from torch.optim.lr_scheduler import MultiStepLR
+ from torch.utils.data import DataLoader
+ from transformers import AutoConfig, AutoImageProcessor, AutoModelForVision2Seq, AutoProcessor
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ import wandb
+ os.environ["WANDB_MODE"] = "offline"
+
+ from experiments.robot.openvla_utils import (
+     check_model_logic_mismatch,
+     model_is_on_hf_hub,
+     update_auto_map,
+ )
+
+ from prismatic.extern.hf.configuration_prismatic import OpenVLAConfig
+ from prismatic.extern.hf.modeling_prismatic import OpenVLAForActionPrediction
+ from prismatic.extern.hf.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
+ from prismatic.models.action_heads import DiffusionActionHead, L1RegressionActionHead
+ from prismatic.models.backbones.llm.prompting import PurePromptBuilder
+ from prismatic.models.film_vit_wrapper import FiLMedPrismaticVisionBackbone
+ from prismatic.models.projectors import (
+     NoisyActionProjector,
+     ProprioProjector,
+ )
+ from prismatic.training.train_utils import (
+     compute_actions_l1_loss,
+     compute_token_accuracy,
+     get_current_action_mask,
+     get_next_actions_mask,
+ )
+ from prismatic.util.data_utils import PaddedCollatorForActionPrediction
+ from prismatic.vla.action_tokenizer import ActionTokenizer
+ from prismatic.vla.constants import (
+     ACTION_DIM,
+     ACTION_PROPRIO_NORMALIZATION_TYPE,
+     NUM_ACTIONS_CHUNK,
+     PROPRIO_DIM,
+ )
+ from prismatic.vla.datasets import RLDSBatchTransform, RLDSDataset
+ from prismatic.vla.datasets.rlds.utils.data_utils import save_dataset_statistics
+
+ # Sane Defaults
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+ import debugpy
+ try:
+     debugpy.listen(("localhost", 9501))
+     print("Waiting for debugger attach")
+     debugpy.wait_for_client()
+ except Exception as e:
+     pass
+
+
+ @dataclass
+ class FinetuneConfig:
+     # fmt: off
+     vla_path: str = "openvla/openvla-7b"           # Path to OpenVLA model (on HuggingFace Hub or stored locally)
+
+     # Dataset
+     data_root_dir: Path = Path("datasets/rlds")    # Directory containing RLDS datasets
+     dataset_name: str = "aloha_scoop_x_into_bowl"  # Name of fine-tuning dataset (e.g., `aloha_scoop_x_into_bowl`)
+     run_root_dir: Path = Path("runs")              # Path to directory to store logs & checkpoints
+     shuffle_buffer_size: int = 100_000             # Dataloader shuffle buffer size (can reduce if OOM errors occur)
+
+     # Algorithm and architecture
+     use_l1_regression: bool = True                 # If True, trains continuous action head with L1 regression objective
+     use_diffusion: bool = False                    # If True, trains continuous action head with diffusion modeling objective (DDIM)
+     num_diffusion_steps_train: int = 50            # (When `diffusion==True`) Number of diffusion steps used for training
+     use_film: bool = False                         # If True, uses FiLM to infuse language inputs into visual features
+     num_images_in_input: int = 1                   # Number of images in the VLA input (default: 1)
+     use_proprio: bool = False                      # If True, includes robot proprioceptive state in input
+
+     # Training configuration
+     batch_size: int = 8                            # Batch size per device (total batch size = batch_size * num GPUs)
+     learning_rate: float = 5e-4                    # Learning rate
+     lr_warmup_steps: int = 0                       # Number of steps to warm up learning rate (from 10% to 100%)
+     num_steps_before_decay: int = 100_000          # Number of steps before LR decays by 10x
+     grad_accumulation_steps: int = 1               # Number of gradient accumulation steps
+     max_steps: int = 200_000                       # Max number of training steps
+     use_val_set: bool = False                      # If True, uses validation set and logs validation metrics
+     val_freq: int = 10_000                         # (When `use_val_set==True`) Validation set logging frequency in steps
+     val_time_limit: int = 180                      # (When `use_val_set==True`) Time limit for computing validation metrics
+     save_freq: int = 10_000                        # Checkpoint saving frequency in steps
+     save_latest_checkpoint_only: bool = False      # If True, saves only 1 checkpoint, overwriting latest checkpoint
+                                                    #   (If False, saves all checkpoints)
+     resume: bool = False                           # If True, resumes from checkpoint
+     resume_step: Optional[int] = None              # (When `resume==True`) Step number that we are resuming from
+     image_aug: bool = True                         # If True, trains with image augmentations (HIGHLY RECOMMENDED)
+     diffusion_sample_freq: int = 50                # (When `use_diffusion==True`) Frequency for sampling in steps
+
+     # LoRA
+     use_lora: bool = True                          # If True, uses LoRA fine-tuning
+     lora_rank: int = 32                            # Rank of LoRA weight matrix
+     lora_dropout: float = 0.0                      # Dropout applied to LoRA weights
+     merge_lora_during_training: bool = True        # If True, merges LoRA weights and saves result during training
+                                                    #   Note: Merging can be very slow on some machines. If so, set to
+                                                    #   False and merge final checkpoint offline!
+
+     # Logging
+     wandb_entity: str = "your-wandb-entity"        # Name of WandB entity
+     wandb_project: str = "your-wandb-project"      # Name of WandB project
+     run_id_note: Optional[str] = None              # Extra note to add to end of run ID for logging
+     run_id_override: Optional[str] = None          # Optional string to override the run ID with
+     wandb_log_freq: int = 10                       # WandB logging frequency in steps
+
+     # fmt: on
+
+
+ def remove_ddp_in_checkpoint(state_dict) -> dict:
+     """
+     Removes the 'module.' prefix from parameter names in a PyTorch model state dictionary that was saved using
+     DistributedDataParallel (DDP).
+
+     When a model is trained using PyTorch's DistributedDataParallel, the saved state dictionary contains parameters
+     prefixed with 'module.'. This function removes these prefixes to make the state dictionary compatible when
+     loading into models that are not yet wrapped in DDP.
+
+     Args:
+         state_dict (dict): PyTorch model state dictionary.
+
+     Returns:
+         dict: A new state dictionary with the same contents but with 'module.' prefixes removed from parameter names.
+             Parameters without the 'module.' prefix remain unchanged.
+     """
+     new_state_dict = {}
+     for k, v in state_dict.items():
+         if k[:7] == "module.":
+             new_state_dict[k[7:]] = v
+         else:
+             new_state_dict[k] = v
+     return new_state_dict
+
+
+ def get_run_id(cfg) -> str:
+     """
+     Generates or retrieves an identifier string for an experiment run.
+
+     Args:
+         cfg (FinetuneConfig): Training configuration.
+
+     Returns:
+         str: Experiment run ID.
+     """
+     if cfg.run_id_override is not None:
+         # Override the run ID with the user-provided ID
+         run_id = cfg.run_id_override
+     elif cfg.resume:
+         # Override run ID with the previous resumed run's ID
+         run_id = cfg.vla_path.split("/")[-1]
+         # Remove the "--XXX_chkpt" suffix from the run ID if it exists
+         if "chkpt" in run_id.split("--")[-1]:
+             run_id = "--".join(run_id.split("--")[:-1])
+     else:
+         run_id = (
+             f"{cfg.vla_path.split('/')[-1]}+{cfg.dataset_name}"
+             f"+b{cfg.batch_size * cfg.grad_accumulation_steps}"
+             f"+lr-{cfg.learning_rate}"
+         )
+         if cfg.use_lora:
+             run_id += f"+lora-r{cfg.lora_rank}+dropout-{cfg.lora_dropout}"
+         if cfg.image_aug:
+             run_id += "--image_aug"
+         if cfg.run_id_note is not None:
+             run_id += f"--{cfg.run_id_note}"
+     return run_id
+
+
+ def load_checkpoint(module_name: str, path: str, step: int, device: str = "cpu") -> dict:
+     """
+     Loads a checkpoint for a given module.
+
+     Args:
+         module_name (str): Name of model component to load checkpoint for.
+         path (str): Path to checkpoint directory.
+         step (int): Gradient step number of saved checkpoint.
+         device (str): String specifying how to remap storage locations (default = "cpu").
+
+     Returns:
+         dict: PyTorch model state dictionary.
+     """
+     checkpoint_path = os.path.join(path, f"{module_name}--{step}_checkpoint.pt")
+     print(f"Loading checkpoint: {checkpoint_path}")
+     state_dict = torch.load(checkpoint_path, weights_only=True, map_location=device)
+     return remove_ddp_in_checkpoint(state_dict)
+
+
+ def wrap_ddp(module: nn.Module, device_id: int, find_unused: bool = False) -> DDP:
+     """
+     Wrap a module with DistributedDataParallel.
+
+     Args:
+         module (nn.Module): PyTorch module.
+         device_id (int): Device ID.
+         find_unused (bool): Whether to detect parameters without gradients in distributed training.
+
+     Returns:
+         DistributedDataParallel: PyTorch module wrapped with DDP.
+     """
+     return DDP(module, device_ids=[device_id], find_unused_parameters=find_unused, gradient_as_bucket_view=True)
+
+
+ def count_parameters(module: nn.Module, name: str) -> None:
+     """
+     Counts and prints the number of trainable parameters in a module.
+
+     Args:
+         module (nn.Module): PyTorch module.
+         name (str): Name of model component.
+
+     Returns:
+         None.
+     """
+     num_params = sum(p.numel() for p in module.parameters() if p.requires_grad)
+     print(f"# trainable params in {name}: {num_params}")
+
+
+ def init_module(
+     module_class: Type[nn.Module],
+     module_name: str,
+     cfg: FinetuneConfig,
+     device_id: int,
+     module_args: dict,
+     to_bf16: bool = False,
+     find_unused_params: bool = False,
+ ) -> DDP:
+     """
+     Initializes a module, optionally loads checkpoint, moves to device, and wraps with DDP.
+
+     Args:
+         module_class (Type[nn.Module]): Class of PyTorch module to initialize.
+         module_name (str): Name of model component to load checkpoint for.
+         cfg (FinetuneConfig): Training configuration.
+         device_id (int): Device ID.
+         module_args (dict): Args for initializing the module.
+         to_bf16 (bool): Whether to convert to torch.bfloat16 data type.
+         find_unused_params (bool): Whether to detect parameters without gradients in distributed training.
+
+     Returns:
+         DistributedDataParallel: PyTorch module wrapped with DDP.
+     """
+     module = module_class(**module_args)
+     count_parameters(module, module_name)
+
+     if cfg.resume:
+         state_dict = load_checkpoint(module_name, cfg.vla_path, cfg.resume_step)
+         module.load_state_dict(state_dict)
+
+     if to_bf16:
+         module = module.to(torch.bfloat16)
+     module = module.to(device_id)
+
+     return wrap_ddp(module, device_id, find_unused_params)
+
+
+ def run_forward_pass(
+     vla,
+     action_head,
+     noisy_action_projector,
+     proprio_projector,
+     batch,
+     action_tokenizer,
+     device_id,
+     use_l1_regression,
+     use_diffusion,
+     use_proprio,
+     use_film,
+     num_patches,
+     compute_diffusion_l1=False,
+     num_diffusion_steps_train=None,
+ ) -> Tuple[torch.Tensor, Dict[str, float]]:
+     """
+     Compute model forward pass and metrics for both training and validation.
+
+     Args:
+         vla (OpenVLAForActionPrediction): Vision-language-action policy.
+         action_head (nn.Module): Action head module.
+         noisy_action_projector (nn.Module): Noisy action projector module (only used for diffusion).
+         proprio_projector (nn.Module): Proprioceptive state projector module.
+         batch (dict): Input batch.
+         action_tokenizer (ActionTokenizer): Action tokenizer.
+         device_id (str): Device ID.
+         use_l1_regression (bool): Whether to use L1 regression.
+         use_diffusion (bool): Whether to use diffusion.
+         use_proprio (bool): Whether to use proprioceptive state as input.
+         use_film (bool): Whether to use FiLM for better language following.
+         num_patches (int): Number of vision patches.
+         compute_diffusion_l1 (bool): Whether to sample actions and compute L1 loss for diffusion (do this once every
+             diffusion_sample_freq steps during training; do it every batch for validation).
+         num_diffusion_steps_train (int): Number of diffusion steps for training (only used for diffusion).
+
+     Returns:
+         tuple: (loss, metrics_dict)
+             loss: The loss tensor with gradient for backpropagation.
+             metrics_dict: Dictionary of computed metrics (detached values for logging).
+     """
+     metrics = {}
+
+     # Get ground-truth action labels
+     ground_truth_actions = batch["actions"].to(device_id).to(torch.bfloat16)
+
+     # [Only for diffusion] Sample noisy actions used as input for noise predictor network
+     if use_diffusion:
+         noisy_dict = action_head.module.sample_noisy_actions(ground_truth_actions)
+         noise, noisy_actions, diffusion_timestep_embeddings = (
+             noisy_dict["noise"],
+             noisy_dict["noisy_actions"],
+             noisy_dict["diffusion_timestep_embeddings"],
+         )
+     else:
+         noise, noisy_actions, diffusion_timestep_embeddings = None, None, None
+
+     # VLA forward pass
+     with torch.autocast("cuda", dtype=torch.bfloat16):
+         output: CausalLMOutputWithPast = vla(
+             input_ids=batch["input_ids"].to(device_id),
+             attention_mask=batch["attention_mask"].to(device_id),
+             pixel_values=batch["pixel_values"].to(torch.bfloat16).to(device_id),
+             labels=batch["labels"],
+             output_hidden_states=True,
+             proprio=batch["proprio"] if use_proprio else None,
+             proprio_projector=proprio_projector if use_proprio else None,
+             noisy_actions=noisy_actions if use_diffusion else None,
+             noisy_action_projector=noisy_action_projector if use_diffusion else None,
+             diffusion_timestep_embeddings=diffusion_timestep_embeddings if use_diffusion else None,
+             use_film=use_film,
+         )
+
+     # Get action masks needed for logging
+     ground_truth_token_ids = batch["labels"][:, 1:].to(device_id)
+     current_action_mask = get_current_action_mask(ground_truth_token_ids)
+     next_actions_mask = get_next_actions_mask(ground_truth_token_ids)
+
+     # Compute metrics for discrete action representation (next-token prediction)
+     if not (use_l1_regression or use_diffusion):
+         loss = output.loss
+         predicted_token_ids = output.logits[:, num_patches:-1].argmax(dim=2)
+         curr_action_accuracy = compute_token_accuracy(
+             predicted_token_ids, ground_truth_token_ids, mask=current_action_mask
+         )
+         curr_action_l1_loss = compute_actions_l1_loss(
+             action_tokenizer, predicted_token_ids, ground_truth_token_ids, mask=current_action_mask
+         )
+         next_actions_accuracy = compute_token_accuracy(
+             predicted_token_ids, ground_truth_token_ids, mask=next_actions_mask
+         )
+         next_actions_l1_loss = compute_actions_l1_loss(
+             action_tokenizer, predicted_token_ids, ground_truth_token_ids, mask=next_actions_mask
+         )
+         metrics.update(
+             {
+                 "loss_value": loss.item(),  # Detached value for logging
+                 "curr_action_accuracy": curr_action_accuracy.item(),
+                 "curr_action_l1_loss": curr_action_l1_loss.item(),
+                 "next_actions_accuracy": next_actions_accuracy.item(),
+                 "next_actions_l1_loss": next_actions_l1_loss.item(),
+             }
+         )
+     # Compute metrics for continuous action representations (L1 regression | diffusion)
+     else:
+         # Get last layer hidden states
+         last_hidden_states = output.hidden_states[-1]  # (B, seq_len, D)
+         # Get hidden states for text portion of prompt+response (after the vision patches)
+         text_hidden_states = last_hidden_states[:, num_patches:-1]
+         # Get hidden states for action portion of response
+         batch_size = batch["input_ids"].shape[0]
+         actions_hidden_states = (
+             text_hidden_states[current_action_mask | next_actions_mask]
+             .reshape(batch_size, NUM_ACTIONS_CHUNK * ACTION_DIM, -1)
+             .to(torch.bfloat16)
+         )  # (B, act_chunk_len, D)
+
+         if use_l1_regression:
+             # Predict action
+             predicted_actions = action_head.module.predict_action(actions_hidden_states)
+             # Get full L1 loss
+             loss = torch.nn.L1Loss()(ground_truth_actions, predicted_actions)
+
+         if use_diffusion:
+             # Predict noise
+             noise_pred = action_head.module.predict_noise(actions_hidden_states)
+             # Get diffusion noise prediction MSE loss
+             noise_pred = noise_pred.reshape(noise.shape)
+             loss = nn.functional.mse_loss(noise_pred, noise, reduction="mean")
+
+             # Only sample actions and compute L1 losses if specified
+             if compute_diffusion_l1:
+                 with torch.no_grad():
+                     predicted_actions = run_diffusion_sampling(
+                         vla=vla,
+                         action_head=action_head,
+                         noisy_action_projector=noisy_action_projector,
+                         proprio_projector=proprio_projector,
+                         batch=batch,
+                         batch_size=batch_size,
+                         num_patches=num_patches,
+                         actions_shape=ground_truth_actions.shape,
+                         device_id=device_id,
+                         current_action_mask=current_action_mask,
+                         next_actions_mask=next_actions_mask,
+                         use_proprio=use_proprio,
+                         use_film=use_film,
+                     )
+
+         metrics.update(
+             {
+                 "loss_value": loss.item(),  # Detached value for logging
+             }
+         )
+
+         # Get detailed L1 losses for logging
+         should_log_l1_loss = not use_diffusion or (use_diffusion and compute_diffusion_l1)
+         if should_log_l1_loss:
+             ground_truth_curr_action = ground_truth_actions[:, 0]
+             predicted_curr_action = predicted_actions[:, 0]
+             ground_truth_next_actions = ground_truth_actions[:, 1:]
+             predicted_next_actions = predicted_actions[:, 1:]
+             curr_action_l1_loss = torch.nn.L1Loss()(ground_truth_curr_action, predicted_curr_action)
+             next_actions_l1_loss = torch.nn.L1Loss()(ground_truth_next_actions, predicted_next_actions)
+             metrics.update(
+                 {
+                     "curr_action_l1_loss": curr_action_l1_loss.item(),
+                     "next_actions_l1_loss": next_actions_l1_loss.item(),
+                 }
+             )
+
+     # Return both the loss tensor (with gradients) and the metrics dictionary (with detached values)
+     return loss, metrics
+
+
+ def run_diffusion_sampling(
+     vla,
+     action_head,
+     noisy_action_projector,
+     proprio_projector,
+     batch,
+     batch_size,
+     num_patches,
+     actions_shape,
+     device_id,
+     current_action_mask,
+     next_actions_mask,
+     use_proprio,
+     use_film,
+ ) -> torch.Tensor:
+     """
+     Run diffusion sampling (reverse diffusion) to generate actions.
+
+     Args:
+         vla (OpenVLAForActionPrediction): Vision-language-action policy.
+         action_head (nn.Module): Action head module.
+         noisy_action_projector (nn.Module): Noisy action projector module (only used for diffusion).
+         proprio_projector (nn.Module): Proprioceptive state projector module.
+         batch (dict): Input batch.
+         batch_size (int): Batch size.
+         num_patches (int): Number of vision patches.
+         actions_shape (tuple): Shape of ground-truth actions.
+         device_id (str): Device ID.
+         current_action_mask (torch.Tensor): Mask for current action.
+         next_actions_mask (torch.Tensor): Mask for next actions.
+         use_proprio (bool): Whether to use proprioceptive state as input.
+         use_film (bool): Whether to use FiLM for better language following.
+
+     Returns:
+         torch.Tensor: Predicted actions.
+     """
+     # Sample random noisy action, used as the starting point for reverse diffusion
+     noise = torch.randn(
+         size=(batch_size, NUM_ACTIONS_CHUNK, ACTION_DIM),
+         device=device_id,
+         dtype=torch.bfloat16,
+     )  # (B, chunk_len, action_dim)
+
+     # Set diffusion timestep values
+     action_head.module.noise_scheduler.set_timesteps(action_head.module.num_diffusion_steps_train)
+
+     # Reverse diffusion: Iteratively denoise to generate action, conditioned on observation
+     curr_noisy_actions = noise
+     for t in action_head.module.noise_scheduler.timesteps:
+         # Get diffusion model's noise prediction (conditioned on VLA latent embedding, current noisy action embedding,
+         # and diffusion timestep embedding)
+         timesteps = torch.Tensor([t]).repeat(batch_size).to(device_id)
+         diffusion_timestep_embeddings = (
+             action_head.module.time_encoder(timesteps).to(curr_noisy_actions.dtype).to(curr_noisy_actions.device)
+         )  # (B, llm_dim)
+         diffusion_timestep_embeddings = diffusion_timestep_embeddings.unsqueeze(1)  # (B, 1, llm_dim)
+
+         with torch.autocast("cuda", dtype=torch.bfloat16):
+             output = vla(
+                 input_ids=batch["input_ids"].to(device_id),
+                 attention_mask=batch["attention_mask"].to(device_id),
+                 pixel_values=batch["pixel_values"].to(torch.bfloat16).to(device_id),
+                 labels=batch["labels"],
+                 output_hidden_states=True,
+                 proprio=batch["proprio"] if use_proprio else None,
+                 proprio_projector=proprio_projector if use_proprio else None,
+                 noisy_actions=curr_noisy_actions,
+                 noisy_action_projector=noisy_action_projector,
+                 diffusion_timestep_embeddings=diffusion_timestep_embeddings,
+                 use_film=use_film,
+             )
+             # Get last layer hidden states
+             last_hidden_states = output.hidden_states[-1]  # (B, seq_len, D)
+             # Get hidden states for text portion of prompt+response (after the vision patches)
+             text_hidden_states = last_hidden_states[:, num_patches:-1]
+             # Get hidden states for action portion of response
+             actions_hidden_states = text_hidden_states[current_action_mask | next_actions_mask].reshape(
+                 batch_size, NUM_ACTIONS_CHUNK * ACTION_DIM, -1
+             )  # (B, act_chunk_len, D)
+             actions_hidden_states = actions_hidden_states.to(torch.bfloat16)
+             # Predict noise
+             noise_pred = action_head.module.predict_noise(actions_hidden_states)
+
+         # Compute the action at the previous diffusion timestep: x_t -> x_{t-1}
+         curr_noisy_actions = action_head.module.noise_scheduler.step(noise_pred, t, curr_noisy_actions).prev_sample
+
+     return curr_noisy_actions.reshape(actions_shape)
+
+
+ def compute_smoothened_metrics(metrics_deques) -> dict:
+     """
+     Compute smoothened metrics from recent deques.
+
+     Args:
+         metrics_deques (dict): Dictionary of deques containing recent metrics.
+
+     Returns:
+         dict: Dictionary of smoothened metrics.
+     """
+     smoothened_metrics = {}
+     for name, deque in metrics_deques.items():
+ if deque and len(deque) > 0:
556
+ smoothened_metrics[name] = sum(deque) / len(deque)
557
+ return smoothened_metrics
558
+
559
+
560
+ def log_metrics_to_wandb(metrics, prefix, step, wandb_entity) -> None:
561
+ """
562
+ Log metrics to Weights & Biases.
563
+
564
+ Args:
565
+ metrics (dict): Dictionary of metrics to log
566
+ prefix (str): Prefix for metric names
567
+ step (int): Training step
568
+ wandb_entity (str): W&B entity instance
569
+
570
+ Returns:
571
+ None.
572
+ """
573
+ log_dict = {}
574
+ for name, value in metrics.items():
575
+ # Map loss_value to Loss for better readability in W&B
576
+ if name == "loss_value":
577
+ log_dict[f"{prefix}/Loss"] = value
578
+ # Keep other metrics as is
579
+ else:
580
+ log_dict[f"{prefix}/{name.replace('_', ' ').title()}"] = value
581
+ wandb_entity.log(log_dict, step=step)
582
+
583
+
584
+ def save_training_checkpoint(
585
+ cfg,
586
+ run_dir,
587
+ log_step,
588
+ vla,
589
+ processor,
590
+ proprio_projector,
591
+ noisy_action_projector,
592
+ action_head,
593
+ train_dataset,
594
+ distributed_state,
595
+ ) -> None:
596
+ """
597
+ Save all training checkpoints including model components, LoRA adapter, and dataset statistics.
598
+
599
+ Args:
600
+ cfg (FinetuneConfig): Training configuration.
601
+ run_dir (Path): Experiment run directory path.
602
+ log_step (int): Current logging step.
603
+ vla (OpenVLAForActionPrediction): Vision-language-action policy.
604
+ processor (PrismaticProcessor): OpenVLA inputs processor.
605
+ proprio_projector (nn.Module): Proprioceptive state projector module.
606
+ noisy_action_projector (nn.Module): Noisy action projector module (only used for diffusion).
607
+ action_head (nn.Module): Action head module.
608
+ train_dataset (RLDSDataset): Training dataset.
609
+ distributed_state (PartialState): Distributed training state.
610
+
611
+ Returns:
612
+ None.
613
+ """
614
+ # Determine checkpoint paths and naming
615
+ if cfg.save_latest_checkpoint_only:
616
+ checkpoint_dir = run_dir
617
+ checkpoint_name_suffix = "latest_checkpoint.pt"
618
+ else:
619
+ checkpoint_dir = Path(str(run_dir) + f"--{log_step}_chkpt")
620
+ checkpoint_name_suffix = f"{log_step}_checkpoint.pt"
621
+
622
+ adapter_dir = checkpoint_dir / "lora_adapter"
623
+
624
+ # Create directories and save dataset statistics (main process only)
625
+ if distributed_state.is_main_process:
626
+ os.makedirs(checkpoint_dir, exist_ok=True)
627
+ os.makedirs(adapter_dir, exist_ok=True)
628
+ save_dataset_statistics(train_dataset.dataset_statistics, checkpoint_dir)
629
+ print(f"Saving Model Checkpoint for Step {log_step}")
630
+
631
+ # Wait for directories to be created
632
+ dist.barrier()
633
+
634
+ # Save model components (main process only)
635
+ if distributed_state.is_main_process:
636
+ # Save processor and LoRA adapter
637
+ processor.save_pretrained(checkpoint_dir)
638
+ vla.module.save_pretrained(adapter_dir)
639
+
640
+ # Save other components
641
+ if cfg.use_proprio and proprio_projector is not None:
642
+ torch.save(proprio_projector.state_dict(), checkpoint_dir / f"proprio_projector--{checkpoint_name_suffix}")
643
+
644
+ if cfg.use_diffusion and noisy_action_projector is not None:
645
+ torch.save(
646
+ noisy_action_projector.state_dict(), checkpoint_dir / f"noisy_action_projector--{checkpoint_name_suffix}"
647
+ )
648
+
649
+ if (cfg.use_l1_regression or cfg.use_diffusion) and action_head is not None:
650
+ torch.save(action_head.state_dict(), checkpoint_dir / f"action_head--{checkpoint_name_suffix}")
651
+
652
+ if cfg.use_film:
653
+ # To be safe, just save the entire vision backbone (not just FiLM components)
654
+ torch.save(
655
+ vla.module.vision_backbone.state_dict(), checkpoint_dir / f"vision_backbone--{checkpoint_name_suffix}"
656
+ )
657
+
658
+ # Wait for model components to be saved
659
+ dist.barrier()
660
+
661
+ # Merge LoRA weights into base model and save resulting model checkpoint
662
+ # Note: Can be very slow on some devices; if so, we recommend merging offline
663
+ if cfg.use_lora and cfg.merge_lora_during_training:
664
+ base_vla = AutoModelForVision2Seq.from_pretrained(
665
+ cfg.vla_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True
666
+ )
667
+ merged_vla = PeftModel.from_pretrained(base_vla, adapter_dir)
668
+ merged_vla = merged_vla.merge_and_unload()
669
+
670
+ if distributed_state.is_main_process:
671
+ merged_vla.save_pretrained(checkpoint_dir)
672
+ print(f"Saved merged model for Step {log_step} at: {checkpoint_dir}")
673
+
674
+ # Wait for merged model to be saved
675
+ dist.barrier()
676
+
677
+
678
+ def run_validation(
679
+ vla,
680
+ action_head,
681
+ noisy_action_projector,
682
+ proprio_projector,
683
+ val_dataloader,
684
+ action_tokenizer,
685
+ device_id,
686
+ cfg,
687
+ num_patches,
688
+ log_step,
689
+ distributed_state,
690
+ val_time_limit,
691
+ ) -> None:
692
+ """
693
+ Compute validation set metrics for logging.
694
+
695
+ Args:
696
+ vla (OpenVLAForActionPrediction): Vision-language-action policy.
697
+ action_head (nn.Module): Action head module.
698
+ noisy_action_projector (nn.Module): Noisy action projector module (only used for diffusion).
699
+ proprio_projector (nn.Module): Proprioceptive state projector module.
700
+ val_dataloader (DataLoader): Validation data loader.
701
+ action_tokenizer (ActionTokenizer): Action tokenizer.
702
+ device_id (str): Device ID.
703
+ cfg (FinetuneConfig): Training configuration.
704
+ num_patches (int): Number of vision patches.
705
+ log_step (int): Current logging step.
706
+ distributed_state (PartialState): Distributed training state.
707
+ val_time_limit (int): Time limit for computing validation metrics.
708
+
709
+ Returns:
710
+ None.
711
+ """
712
+ val_start_time = time.time()
713
+ vla.eval()
714
+ val_batches_count = 0
715
+
716
+ # List to store validation metrics
717
+ all_val_metrics = []
718
+
719
+ with torch.no_grad():
720
+ for batch in val_dataloader:
721
+ # Always compute L1 loss for validation, even for diffusion
722
+ _, metrics = run_forward_pass(
723
+ vla=vla,
724
+ action_head=action_head,
725
+ noisy_action_projector=noisy_action_projector,
726
+ proprio_projector=proprio_projector,
727
+ batch=batch,
728
+ action_tokenizer=action_tokenizer,
729
+ device_id=device_id,
730
+ use_l1_regression=cfg.use_l1_regression,
731
+ use_diffusion=cfg.use_diffusion,
732
+ use_proprio=cfg.use_proprio,
733
+ use_film=cfg.use_film,
734
+ num_patches=num_patches,
735
+ compute_diffusion_l1=True,
736
+ num_diffusion_steps_train=cfg.num_diffusion_steps_train if cfg.use_diffusion else None,
737
+ )
738
+
739
+ # Add the loss value to the metrics
740
+ metrics["loss"] = metrics["loss_value"]
741
+ all_val_metrics.append(metrics)
742
+ val_batches_count += 1
743
+
744
+ # Cut testing on validation set short if it exceeds time limit
745
+ if time.time() - val_start_time > val_time_limit:
746
+ break
747
+
748
+ # Compute average validation metrics
749
+ avg_val_metrics = {}
750
+ for metric_name in all_val_metrics[0].keys():
751
+ values = [metrics[metric_name] for metrics in all_val_metrics if metric_name in metrics]
752
+ if values:
753
+ avg_val_metrics[metric_name] = sum(values) / len(values)
754
+
755
+ # Add batch count to metrics
756
+ avg_val_metrics["val_batches_count"] = val_batches_count
757
+
758
+ # Log validation metrics to W&B
759
+ if distributed_state.is_main_process:
760
+ log_metrics_to_wandb(avg_val_metrics, "VLA Val", log_step, wandb)
761
+
762
+
763
+ @draccus.wrap()
764
+ def finetune(cfg: FinetuneConfig) -> None:
765
+ """
766
+ Fine-tunes base VLA on demonstration dataset via LoRA.
767
+
768
+ Allows toggling different action representations (discrete vs. continuous), different learning objectives
769
+ (next-token prediction vs. L1 regression vs. diffusion), FiLM. Also allows for additional model inputs,
770
+ such as additional camera images and robot proprioceptive state. Assumes parallel action generation with
771
+ action chunking.
772
+
773
+ Args:
774
+ cfg (FinetuneConfig): Training configuration.
775
+
776
+ Returns:
777
+ None.
778
+ """
779
+ assert cfg.use_lora, "Only LoRA fine-tuning is supported. Please set --use_lora=True!"
780
+ assert not (cfg.use_l1_regression and cfg.use_diffusion), (
781
+ "Cannot do both L1 regression and diffusion. Please pick one of them!"
782
+ )
783
+
784
+ # Trim trailing forward slash ('/') in VLA path if it exists
785
+ cfg.vla_path = cfg.vla_path.rstrip("/")
786
+ print(f"Fine-tuning OpenVLA Model `{cfg.vla_path}` on `{cfg.dataset_name}`")
787
+
788
+ # Get experiment run ID
789
+ run_id = get_run_id(cfg)
790
+
791
+ # Create experiment run directory
792
+ run_dir = cfg.run_root_dir / run_id
793
+ os.makedirs(run_dir, exist_ok=True)
794
+
795
+ # GPU setup
796
+ distributed_state = PartialState()
797
+ device_id = distributed_state.local_process_index
798
+ torch.cuda.set_device(device_id)
799
+ torch.cuda.empty_cache()
800
+
801
+ # Initialize wandb logging
802
+ if distributed_state.is_main_process:
803
+ wandb.init(entity=cfg.wandb_entity, project=cfg.wandb_project, name=run_id)
804
+
805
+ # Print detected constants
806
+ print(
807
+ "Detected constants:\n"
808
+ f"\tNUM_ACTIONS_CHUNK: {NUM_ACTIONS_CHUNK}\n"
809
+ f"\tACTION_DIM: {ACTION_DIM}\n"
810
+ f"\tPROPRIO_DIM: {PROPRIO_DIM}\n"
811
+ f"\tACTION_PROPRIO_NORMALIZATION_TYPE: {ACTION_PROPRIO_NORMALIZATION_TYPE}"
812
+ )
813
+
814
+ # Two options:
815
+ # (1) Base model is on Hugging Face Hub
816
+ # - Then download it and record the path to the download directory
817
+ # (2) Base model is stored locally
818
+ # - Then register model config in HF Auto Classes
819
+ # In both cases, we want to check whether any changes have been made to
820
+ # the `modeling_prismatic.py` file in this codebase; if so, we will copy
821
+ # the file to the downloaded or locally stored checkpoint directory so
822
+ # that the user's changes to the VLA class logic go into effect
823
+ if model_is_on_hf_hub(cfg.vla_path):
824
+ # Download model directly from Hugging Face Hub
825
+ vla_download_path = snapshot_download(repo_id=cfg.vla_path)
826
+ # Overwrite VLA path
827
+ cfg.vla_path = vla_download_path
828
+ else:
829
+ # Register OpenVLA model to HF Auto Classes (not needed if the model is on HF Hub)
830
+ AutoConfig.register("openvla", OpenVLAConfig)
831
+ AutoImageProcessor.register(OpenVLAConfig, PrismaticImageProcessor)
832
+ AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
833
+ AutoModelForVision2Seq.register(OpenVLAConfig, OpenVLAForActionPrediction)
834
+
835
+ # Update config.json and sync model files
836
+ if distributed_state.is_main_process:
837
+ update_auto_map(cfg.vla_path)
838
+ check_model_logic_mismatch(cfg.vla_path)
839
+
840
+ # Wait for model files to be synced
841
+ dist.barrier()
842
+
843
+ # Load processor and VLA
844
+ processor = AutoProcessor.from_pretrained(cfg.vla_path, trust_remote_code=True)
845
+ vla = AutoModelForVision2Seq.from_pretrained(
846
+ cfg.vla_path,
847
+ torch_dtype=torch.bfloat16,
848
+ low_cpu_mem_usage=True,
849
+ trust_remote_code=True,
850
+ ).to(device_id)
851
+
852
+ # Set number of images in VLA input
853
+ vla.vision_backbone.set_num_images_in_input(cfg.num_images_in_input)
854
+
855
+ # LoRA setup
856
+ if cfg.use_lora:
857
+ lora_config = LoraConfig(
858
+ r=cfg.lora_rank,
859
+ lora_alpha=min(cfg.lora_rank, 16),
860
+ lora_dropout=cfg.lora_dropout,
861
+ target_modules="all-linear",
862
+ init_lora_weights="gaussian",
863
+ )
864
+ vla = get_peft_model(vla, lora_config)
865
+ vla.print_trainable_parameters()
866
+
867
+ # FiLM setup
868
+ if cfg.use_film:
869
+ count_parameters(vla.vision_backbone, "vla.vision_backbone (original)")
870
+ # Wrap vision backbone with FiLM wrapper
871
+ # Important: For this, must specify `vla.model.vision_backbone` instead of just `vla.vision_backbone`, since the
872
+ # latter would cause the new wrapped backbone to be saved as a new attribute of `vla` instead of overwriting the
873
+ # original one (due to the LoRA wrapper)
874
+ vla.model.vision_backbone = FiLMedPrismaticVisionBackbone(
875
+ vision_backbone=vla.model.vision_backbone,
876
+ llm_dim=vla.llm_dim,
877
+ )
878
+ count_parameters(vla.vision_backbone, "vla.vision_backbone (post-wrap)")
879
+ if cfg.resume:
880
+ state_dict = load_checkpoint("vision_backbone", cfg.vla_path, cfg.resume_step)
881
+ vla.model.vision_backbone.load_state_dict(state_dict)
882
+ vla.model.vision_backbone = vla.model.vision_backbone.to(device_id)
883
+
884
+ # Wrap VLA with DDP
885
+ vla = wrap_ddp(vla, device_id, find_unused=True)
886
+
887
+ # If applicable, instantiate proprio projector
888
+ if cfg.use_proprio:
889
+ proprio_projector = init_module(
890
+ ProprioProjector,
891
+ "proprio_projector",
892
+ cfg,
893
+ device_id,
894
+ {"llm_dim": vla.module.llm_dim, "proprio_dim": PROPRIO_DIM},
895
+ )
896
+
897
+ # If applicable, instantiate continuous action head for L1 regression
898
+ if cfg.use_l1_regression:
899
+ action_head = init_module(
900
+ L1RegressionActionHead,
901
+ "action_head",
902
+ cfg,
903
+ device_id,
904
+ {"input_dim": vla.module.llm_dim, "hidden_dim": vla.module.llm_dim, "action_dim": ACTION_DIM},
905
+ to_bf16=True,
906
+ )
907
+
908
+ # If applicable, instantiate diffusion action head and noisy action projector
909
+ if cfg.use_diffusion:
910
+ action_head = init_module(
911
+ DiffusionActionHead,
912
+ "action_head",
913
+ cfg,
914
+ device_id,
915
+ {
916
+ "input_dim": vla.module.llm_dim,
917
+ "hidden_dim": vla.module.llm_dim,
918
+ "action_dim": ACTION_DIM,
919
+ "num_diffusion_steps_train": cfg.num_diffusion_steps_train,
920
+ },
921
+ to_bf16=True,
922
+ )
923
+ noisy_action_projector = init_module(
924
+ NoisyActionProjector, "noisy_action_projector", cfg, device_id, {"llm_dim": vla.module.llm_dim}
925
+ )
926
+
927
+ # Get number of vision patches
928
+ NUM_PATCHES = vla.module.vision_backbone.get_num_patches() * vla.module.vision_backbone.get_num_images_in_input()
929
+ # If we have proprio inputs, a single proprio embedding is appended to the end of the vision patch embeddings
930
+ if cfg.use_proprio:
931
+ NUM_PATCHES += 1
932
+ # For diffusion, a single diffusion timestep embedding is appended to the end of the vision patch embeddings
933
+ if cfg.use_diffusion:
934
+ NUM_PATCHES += 1
935
+
936
+ # Instantiate optimizer
937
+ trainable_params = [param for param in vla.parameters() if param.requires_grad]
938
+ if cfg.use_l1_regression or cfg.use_diffusion:
939
+ trainable_params += [param for param in action_head.parameters() if param.requires_grad]
940
+ if cfg.use_diffusion:
941
+ trainable_params += [param for param in noisy_action_projector.parameters() if param.requires_grad]
942
+ if cfg.use_proprio:
943
+ trainable_params += [param for param in proprio_projector.parameters() if param.requires_grad]
944
+ print(f"# total trainable params: {sum(p.numel() for p in trainable_params)}")
945
+ optimizer = AdamW(trainable_params, lr=cfg.learning_rate)
946
+
947
+ # Record original learning rate
948
+ original_lr = optimizer.param_groups[0]["lr"]
949
+
950
+ # Create learning rate scheduler
951
+ scheduler = MultiStepLR(
952
+ optimizer,
953
+ milestones=[cfg.num_steps_before_decay], # Number of steps after which LR will change
954
+ gamma=0.1, # Multiplicative factor of learning rate decay
955
+ )
956
+
957
+ # Create Action Tokenizer
958
+ action_tokenizer = ActionTokenizer(processor.tokenizer)
959
+
960
+ # Load Fine-tuning Dataset =>> note that we use an RLDS-formatted dataset following Open X-Embodiment by default.
961
+ # =>> If you want to use a non-RLDS dataset (e.g., a standard PyTorch Dataset) see the following commented block.
962
+ # =>> Note that our training code does not loop over epochs because the RLDS loader does this implicitly; if using
963
+ # your own Dataset, make sure to add the appropriate logic to the training loop!
964
+ #
965
+ # ---
966
+ # from prismatic.vla.datasets import DummyDataset
967
+ #
968
+ # train_dataset = DummyDataset(
969
+ # action_tokenizer,
970
+ # processor.tokenizer,
971
+ # image_transform=processor.image_processor.apply_transform,
972
+ # prompt_builder_fn=PurePromptBuilder,
973
+ # )
974
+ # ---
975
+
976
+ # We assume that the model takes as input one third-person camera image and 1 or 2 optional wrist camera image(s)
977
+ use_wrist_image = cfg.num_images_in_input > 1
978
+
979
+ # Create training and optional validation datasets
980
+ batch_transform = RLDSBatchTransform(
981
+ action_tokenizer,
982
+ processor.tokenizer,
983
+ image_transform=processor.image_processor.apply_transform,
984
+ prompt_builder_fn=PurePromptBuilder,
985
+ use_wrist_image=use_wrist_image,
986
+ use_proprio=cfg.use_proprio,
987
+ )
988
+ train_dataset = RLDSDataset(
989
+ cfg.data_root_dir,
990
+ cfg.dataset_name,
991
+ batch_transform,
992
+ resize_resolution=tuple(vla.module.config.image_sizes),
993
+ shuffle_buffer_size=cfg.shuffle_buffer_size,
994
+ image_aug=cfg.image_aug,
995
+ )
996
+ if cfg.use_val_set:
997
+ val_dataset = RLDSDataset(
998
+ cfg.data_root_dir,
999
+ cfg.dataset_name,
1000
+ batch_transform,
1001
+ resize_resolution=tuple(vla.module.config.image_sizes),
1002
+ shuffle_buffer_size=cfg.shuffle_buffer_size // 10,
1003
+ image_aug=cfg.image_aug,
1004
+ train=False,
1005
+ )
1006
+
1007
+ # [Important] Save dataset statistics so that we can unnormalize actions during inference
1008
+ if distributed_state.is_main_process:
1009
+ save_dataset_statistics(train_dataset.dataset_statistics, run_dir)
1010
+
1011
+ # Create collator and dataloader
1012
+ collator = PaddedCollatorForActionPrediction(
1013
+ processor.tokenizer.model_max_length, processor.tokenizer.pad_token_id, padding_side="right"
1014
+ )
1015
+ dataloader = DataLoader(
1016
+ train_dataset,
1017
+ batch_size=cfg.batch_size,
1018
+ sampler=None,
1019
+ collate_fn=collator,
1020
+ num_workers=0, # Important: Set to 0 if using RLDS, which uses its own parallelism
1021
+ )
1022
+ if cfg.use_val_set:
1023
+ val_batch_size = cfg.batch_size
1024
+ val_dataloader = DataLoader(
1025
+ val_dataset,
1026
+ batch_size=val_batch_size,
1027
+ sampler=None,
1028
+ collate_fn=collator,
1029
+ num_workers=0, # Important: Set to 0 if using RLDS, which uses its own parallelism
1030
+ )
1031
+
1032
+ # Deque to store recent train metrics (used for computing smoothened metrics for gradient accumulation)
1033
+ recent_metrics = {
1034
+ "loss_value": deque(maxlen=cfg.grad_accumulation_steps),
1035
+ "curr_action_accuracy": deque(maxlen=cfg.grad_accumulation_steps),
1036
+ "curr_action_l1_loss": deque(maxlen=cfg.grad_accumulation_steps),
1037
+ "next_actions_accuracy": deque(maxlen=cfg.grad_accumulation_steps),
1038
+ "next_actions_l1_loss": deque(maxlen=cfg.grad_accumulation_steps),
1039
+ }
1040
+
1041
+ # Start training
1042
+ with tqdm.tqdm(total=cfg.max_steps, leave=False) as progress:
1043
+ vla.train()
1044
+ optimizer.zero_grad()
1045
+ for batch_idx, batch in enumerate(dataloader):
1046
+ # Compute training metrics and loss
1047
+ compute_diffusion_l1 = cfg.use_diffusion and batch_idx % cfg.diffusion_sample_freq == 0
1048
+ loss, metrics = run_forward_pass(
1049
+ vla=vla,
1050
+ action_head=action_head,
1051
+ noisy_action_projector=noisy_action_projector if cfg.use_diffusion else None,
1052
+ proprio_projector=proprio_projector if cfg.use_proprio else None,
1053
+ batch=batch,
1054
+ action_tokenizer=action_tokenizer,
1055
+ device_id=device_id,
1056
+ use_l1_regression=cfg.use_l1_regression,
1057
+ use_diffusion=cfg.use_diffusion,
1058
+ use_proprio=cfg.use_proprio,
1059
+ use_film=cfg.use_film,
1060
+ num_patches=NUM_PATCHES,
1061
+ compute_diffusion_l1=compute_diffusion_l1,
1062
+ num_diffusion_steps_train=cfg.num_diffusion_steps_train if cfg.use_diffusion else None,
1063
+ )
1064
+
1065
+ # Normalize loss to account for gradient accumulation
1066
+ normalized_loss = loss / cfg.grad_accumulation_steps
1067
+
1068
+ # Backward pass
1069
+ normalized_loss.backward()
1070
+
1071
+ # Store recent train metrics
1072
+ for metric_name, value in metrics.items():
1073
+ if metric_name in recent_metrics:
1074
+ recent_metrics[metric_name].append(value)
1075
+
1076
+ # Compute gradient step index
1077
+ gradient_step_idx = batch_idx // cfg.grad_accumulation_steps
1078
+
1079
+ # Compute smoothened train metrics
1080
+ smoothened_metrics = compute_smoothened_metrics(recent_metrics)
1081
+
1082
+ # Push Metrics to W&B (every wandb_log_freq gradient steps)
1083
+ log_step = gradient_step_idx if not cfg.resume else cfg.resume_step + gradient_step_idx
1084
+ if distributed_state.is_main_process and log_step % cfg.wandb_log_freq == 0:
1085
+ log_metrics_to_wandb(smoothened_metrics, "VLA Train", log_step, wandb)
1086
+
1087
+ # [If applicable] Linearly warm up learning rate from 10% to 100% of original
1088
+ if cfg.lr_warmup_steps > 0:
1089
+ lr_progress = min((gradient_step_idx + 1) / cfg.lr_warmup_steps, 1.0) # Cap at 1.0
1090
+ current_lr = original_lr * (0.1 + 0.9 * lr_progress)
1091
+ for param_group in optimizer.param_groups:
1092
+ param_group["lr"] = current_lr
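+                 # For example, with lr_warmup_steps=1000 and learning_rate=5e-4 (hypothetical values), halfway
+                 # through warmup the LR is 5e-4 * (0.1 + 0.9 * 0.5) = 2.75e-4; once warmup completes, the full
+                 # 5e-4 is used (until the MultiStepLR decay kicks in).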
1093
+
1094
+ if distributed_state.is_main_process and gradient_step_idx % cfg.wandb_log_freq == 0:
1095
+ # Log the learning rate
1096
+ # Make sure to do this AFTER any learning rate modifications (e.g., warmup/decay)
1097
+ wandb.log(
1098
+ {
1099
+ "VLA Train/Learning Rate": scheduler.get_last_lr()[0],
1100
+ },
1101
+ step=log_step,
1102
+ )
1103
+
1104
+ # Optimizer and LR scheduler step
1105
+ if (batch_idx + 1) % cfg.grad_accumulation_steps == 0:
1106
+ optimizer.step()
1107
+ scheduler.step()
1108
+ optimizer.zero_grad()
1109
+ progress.update()
1110
+
1111
+ # Save model checkpoint: either keep latest checkpoint only or all checkpoints
1112
+ if gradient_step_idx > 0 and log_step % cfg.save_freq == 0:
1113
+ save_training_checkpoint(
1114
+ cfg=cfg,
1115
+ run_dir=run_dir,
1116
+ log_step=log_step,
1117
+ vla=vla,
1118
+ processor=processor,
1119
+ proprio_projector=proprio_projector if cfg.use_proprio else None,
1120
+ noisy_action_projector=noisy_action_projector if cfg.use_diffusion else None,
1121
+ action_head=action_head if (cfg.use_l1_regression or cfg.use_diffusion) else None,
1122
+ train_dataset=train_dataset,
1123
+ distributed_state=distributed_state,
1124
+ )
1125
+
1126
+ # Test model on validation set
1127
+ if cfg.use_val_set and log_step > 0 and log_step % cfg.val_freq == 0:
1128
+ run_validation(
1129
+ vla=vla,
1130
+ action_head=action_head,
1131
+ noisy_action_projector=noisy_action_projector if cfg.use_diffusion else None,
1132
+ proprio_projector=proprio_projector if cfg.use_proprio else None,
1133
+ val_dataloader=val_dataloader,
1134
+ action_tokenizer=action_tokenizer,
1135
+ device_id=device_id,
1136
+ cfg=cfg,
1137
+ num_patches=NUM_PATCHES,
1138
+ log_step=log_step,
1139
+ distributed_state=distributed_state,
1140
+ val_time_limit=cfg.val_time_limit,
1141
+ )
1142
+ # Set model back to training mode after validation
1143
+ vla.train()
1144
+
1145
+ # Stop training when max_steps is reached
1146
+ if log_step == cfg.max_steps:
1147
+ print(f"Max step {cfg.max_steps} reached! Stopping training...")
1148
+ break
1149
+
1150
+
1151
+ if __name__ == "__main__":
1152
+ finetune()
capvector-oft/vla-scripts/finetune_regular_loss.py ADDED
@@ -0,0 +1,1790 @@
1
+ # This script is for the CapVector experiment: it stops gradient propagation along the direction of the newly added vector.
2
+ """
3
+ finetune.py
4
+
5
+ Fine-tunes OpenVLA via LoRA.
6
+ """
7
+
8
+ import os
9
+ import ctypes
10
+
11
+ lib_path = "/share/miniconda3/lib/libstdc++.so.6"
12
+
13
+ try:
14
+ ctypes.CDLL(lib_path)
15
+ print(f"Successfully preloaded {lib_path}")
16
+ except Exception as e:
17
+ print(f"Failed to preload {lib_path}: {e}")
18
+
19
+ import os
20
+ import time
21
+ from collections import deque
22
+ from dataclasses import dataclass
23
+ from pathlib import Path
24
+ from typing import Dict, Optional, Tuple, Type
25
+
26
+ import draccus
27
+ import torch
28
+ import torch.distributed as dist
29
+ import torch.nn as nn
30
+ import tqdm
31
+ import numpy as np
32
+ from accelerate import PartialState
33
+ from huggingface_hub import HfApi, snapshot_download
34
+ from peft import LoraConfig, PeftModel, get_peft_model
35
+ from torch.nn.parallel import DistributedDataParallel as DDP
36
+ from torch.optim import AdamW
37
+ from torch.optim.lr_scheduler import MultiStepLR, CosineAnnealingLR
38
+ from torch.utils.data import DataLoader
39
+ from transformers import get_cosine_schedule_with_warmup
40
+ from transformers import AutoConfig, AutoImageProcessor, AutoModelForVision2Seq, AutoProcessor
41
+ from transformers.modeling_outputs import CausalLMOutputWithPast
42
+
43
+ import wandb
44
+ os.environ["WANDB_MODE"] = "offline"
45
+
46
+ try:
47
+ from safetensors import safe_open
48
+ SAFETENSORS_AVAILABLE = True
49
+ except ImportError:
50
+ SAFETENSORS_AVAILABLE = False
51
+ print("Warning: safetensors not available, will try torch.load instead")
52
+
53
+ from experiments.robot.openvla_utils import (
54
+ check_model_logic_mismatch,
55
+ model_is_on_hf_hub,
56
+ update_auto_map,
57
+ )
58
+
59
+ from prismatic.extern.hf.configuration_prismatic import OpenVLAConfig
60
+ from prismatic.extern.hf.modeling_prismatic import OpenVLAForActionPrediction
61
+ from prismatic.extern.hf.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
62
+ from prismatic.models.action_heads import DiffusionActionHead, L1RegressionActionHead
63
+ from prismatic.models.backbones.llm.prompting import PurePromptBuilder
64
+ from prismatic.models.film_vit_wrapper import FiLMedPrismaticVisionBackbone
65
+ from prismatic.models.ema_model import EMAModel
66
+ from prismatic.models.projectors import (
67
+ NoisyActionProjector,
68
+ ProprioProjector,
69
+ )
70
+ from prismatic.training.train_utils import (
71
+ compute_actions_l1_loss,
72
+ compute_token_accuracy,
73
+ get_current_action_mask,
74
+ get_next_actions_mask,
75
+ )
76
+ from prismatic.util.data_utils import PaddedCollatorForActionPrediction
77
+ from prismatic.vla.action_tokenizer import ActionTokenizer
78
+ from prismatic.vla.constants import (
79
+ ACTION_DIM,
80
+ ACTION_PROPRIO_NORMALIZATION_TYPE,
81
+ NUM_ACTIONS_CHUNK,
82
+ PROPRIO_DIM,
83
+ )
84
+ from prismatic.vla.datasets import RLDSBatchTransform, RLDSDataset
85
+ from prismatic.vla.datasets.rlds.utils.data_utils import save_dataset_statistics
86
+
87
+ # Sane Defaults
88
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
89
+
90
+ #wx: stop gradient in the feature vector direction
91
+
92
+ EPS = 1e-12
93
+
94
+ def register_orthogonal_grad_hook(model, vector_W, debug=False):
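+     # For every weight listed in vector_W, register backward hooks that remove the gradient
+     # component aligned with that weight's CapVector direction. Since the LoRA update is
+     # delta_W = B @ A, grad_A = B^T grad_W and grad_B = grad_W A^T (up to the LoRA scaling), so the
+     # vW-aligned direction appears as vA = B^T vW in A-space and vB = vW A^T in B-space; parameters
+     # without LoRA factors (e.g., layernorms) fall back to a direct hook on vW.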
95
+ name_to_param = dict(model.named_parameters())
96
+
97
+ hooked_A = 0
98
+ hooked_B = 0
99
+ hooked_direct = 0
100
+
101
+ missed = 0
102
+ missed_name = []
103
+
104
+ direct_missed = 0
105
+ direct_missed_name = []
106
+
107
+ printed = {"A": False, "B": False, "D": False}
108
+
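+     # proj_out removes from g2 its component along v2: g_new = g - (<g, v> / (||v||^2 + EPS)) * v,
+     # so the returned gradient is (numerically) orthogonal to v2. If ||v2||^2 <= EPS, g2 is returned unchanged.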
109
+ def proj_out(g2, v2):
110
+ vn2 = (v2 * v2).sum().detach()
111
+ if vn2.item() <= EPS:
112
+ return g2
113
+ gv = (g2 * v2).sum()
114
+ return g2 - (gv / (vn2 + EPS)) * v2
115
+
116
+ for w_name, vW in vector_W.items():
117
+ if "vision_backbone" in w_name:
118
+ continue
119
+
120
+ prefix = "base_model.model."
121
+ A_name = prefix + w_name.replace(".weight", ".lora_A.default.weight")
122
+ B_name = prefix + w_name.replace(".weight", ".lora_B.default.weight")
123
+
124
+ # ===== 1) First, try the LoRA hook =====
125
+ if A_name in name_to_param and B_name in name_to_param:
126
+ A = name_to_param[A_name]
127
+ B = name_to_param[B_name]
128
+
129
+ # If neither A nor B is trainable, do not hook
130
+ if (not A.requires_grad) and (not B.requires_grad):
131
+ continue
132
+
133
+ # Move vW to A's device/dtype
134
+ vW = vW.to(device=A.device, dtype=A.dtype)
135
+ vW2 = vW.reshape(vW.shape[0], -1) if vW.ndim != 2 else vW # [out, in_flat]
136
+
137
+ # ---- hook A: dynamically compute vA = B^T vW using the current B ----
138
+ if A.requires_grad:
139
+ def hook_A(g, A_ref=A, B_ref=B, vW2_ref=vW2):
140
+ if g is None:
141
+ return None
142
+ g2 = g.reshape(g.shape[0], -1) if g.ndim != 2 else g
143
+
144
+ B_mat = B_ref.detach()
145
+ B2 = B_mat.reshape(B_mat.shape[0], -1) if B_mat.ndim != 2 else B_mat # [out, r]
146
+
147
+ if B2.shape[0] != vW2_ref.shape[0]:
148
+ return g
149
+
150
+ vA = torch.matmul(B2.transpose(0, 1), vW2_ref) # [r, in_flat]
151
+
152
+ if debug and not printed["A"]:
153
+ print(f"[hook fired] A: ||B||={B2.norm().item():.4e}, ||vA||={vA.norm().item():.4e}, ||g||={g2.norm().item():.4e}")
154
+ printed["A"] = True
155
+
156
+ g2_new = proj_out(g2, vA)
157
+ return g2_new.view_as(g)
158
+
159
+ A.register_hook(hook_A)
160
+ hooked_A += 1
161
+
162
+ # ---- hook B: dynamically compute vB = vW A^T using the current A ----
163
+ if B.requires_grad:
164
+ def hook_B(g, A_ref=A, B_ref=B, vW2_ref=vW2):
165
+ if g is None:
166
+ return None
167
+ g2 = g.reshape(g.shape[0], -1) if g.ndim != 2 else g
168
+
169
+ A_mat = A_ref.detach()
170
+ A2 = A_mat.reshape(A_mat.shape[0], -1) if A_mat.ndim != 2 else A_mat # [r, in_flat]
171
+
172
+ if A2.shape[1] != vW2_ref.shape[1]:
173
+ return g
174
+
175
+ vB = torch.matmul(vW2_ref, A2.transpose(0, 1)) # [out, r]
176
+
177
+ if debug and not printed["B"]:
178
+ print(f"[hook fired] B: ||A||={A2.norm().item():.4e}, ||vB||={vB.norm().item():.4e}, ||g||={g2.norm().item():.4e}")
179
+ printed["B"] = True
180
+
181
+ g2_new = proj_out(g2, vB)
182
+ return g2_new.view_as(g)
183
+
184
+ B.register_hook(hook_B)
185
+ hooked_B += 1
186
+
187
+ # This entry was successfully handled by the LoRA branch
188
+ continue
189
+
190
+ # ===== 2) No LoRA weights found: fall back to hooking the parameter directly (e.g., layernorm) =====
191
+ missed += 1
192
+ missed_name.append(w_name)
193
+
194
+ # Try to align with the non-LoRA parameter name
195
+ # In the vast majority of cases: base_model.model.<w_name>
196
+ direct_name = prefix + w_name
197
+
198
+ # Some vector keys may lack the base_model.model prefix, while the model's parameter names may use a different prefix
199
+ # As a fallback, try once more: if direct_name is not found, try dropping prefixes such as language_model/
200
+ # (You can add more matching rules to fit your own project)
201
+ if direct_name not in name_to_param:
202
+ # Try again: if w_name already contains base_model.model, do not add the prefix
203
+ if w_name in name_to_param:
204
+ direct_name = w_name
205
+ else:
206
+ direct_missed += 1
207
+ direct_missed_name.append(w_name)
208
+ continue
209
+
210
+ P = name_to_param[direct_name]
211
+ if not P.requires_grad:
212
+ # Found but not trainable: do not hook it, and do not count it as direct_missed
213
+ continue
214
+
215
+ vP = vector_W[w_name].to(device=P.device, dtype=P.dtype)
216
+ vP2 = vP.reshape(vP.shape[0], -1) if vP.ndim != 2 else vP
217
+
218
+ def hook_direct(g, v_ref=vP2, name_ref=direct_name):
219
+ if g is None:
220
+ return None
221
+ g2 = g.reshape(g.shape[0], -1) if g.ndim != 2 else g
222
+
223
+ # If shapes do not match, leave the gradient unchanged (avoids shape errors inside the hook)
224
+ if g2.shape != v_ref.shape:
225
+ return g
226
+
227
+ if debug and not printed["D"]:
228
+ print(f"[hook fired] Direct: param={name_ref}, ||v||={v_ref.norm().item():.4e}, ||g||={g2.norm().item():.4e}")
229
+ printed["D"] = True
230
+
231
+ g2_new = proj_out(g2, v_ref)
232
+ return g2_new.view_as(g)
233
+
234
+ P.register_hook(hook_direct)
235
+ hooked_direct += 1
236
+
237
+ print(
238
+ f"[hook summary] hooked lora_A: {hooked_A}, lora_B: {hooked_B}, direct: {hooked_direct}, "
239
+ f"missed(lora-not-found): {missed}, direct_missed: {direct_missed}"
240
+ )
241
+
242
+ # To inspect the specific missed names:
243
+ # print("[missed lora-not-found names]")
244
+ # for n in missed_name: print(" -", n)
245
+ # print("[direct_missed names]")
246
+ # for n in direct_missed_name: print(" -", n)
247
+
248
+ # import pdb; pdb.set_trace()
249
+
250
+
251
+ # def register_orthogonal_grad_hook(model, vector_W, debug=False):
252
+ # name_to_param = dict(model.named_parameters())
253
+
254
+ # hooked_A = 0
255
+ # hooked_B = 0
256
+ # missed = 0
257
+
258
+ # printed = {"A": False, "B": False}  # used to print only once
259
+
260
+ # for w_name, vW in vector_W.items():
261
+ # if "vision_backbone" in w_name:
262
+ # continue
263
+ # # import pdb; pdb.set_trace()
264
+ # prefix = "base_model.model."
265
+ # A_name = prefix + w_name.replace(".weight", ".lora_A.default.weight")
266
+ # B_name = prefix + w_name.replace(".weight", ".lora_B.default.weight")
267
+
268
+ # if A_name not in name_to_param or B_name not in name_to_param:
269
+ # missed += 1
270
+ # continue
271
+
272
+ # A = name_to_param[A_name]
273
+ # B = name_to_param[B_name]
274
+
275
+ # if (not A.requires_grad) and (not B.requires_grad):
276
+ # continue
277
+
278
+ # vW = vW.to(device=A.device, dtype=A.dtype)
279
+
280
+ # with torch.no_grad():
281
+ # # A_mat = A.detach().view(1, -1) # (1, in)
282
+ # # B_mat = B.detach().view(-1, 1) # (out,1)
283
+
284
+ # # vA = torch.matmul(B_mat.T, vW) # (1,in)
285
+ # # vB = torch.matmul(vW, A_mat.T) # (out,1)
286
+ # B_mat = B.detach()
287
+ # A_mat = A.detach()
288
+ # # import pdb; pdb.set_trace()
289
+
290
+ # # Flatten vW to 2D: [out, in_flat]
291
+ # if vW.ndim != 2:
292
+ # vW2 = vW.reshape(vW.shape[0], -1)
293
+ # else:
294
+ # vW2 = vW
295
+
296
+ # # A may not be strictly 2D (it usually is, but reshape to be safe); in practice both A and B are 2D
297
+ # if A_mat.ndim != 2:
298
+ # A2 = A_mat.reshape(A_mat.shape[0], -1) # [r, in_flat]
299
+ # else:
300
+ # A2 = A_mat
301
+
302
+ # # B is usually 2D: [out, r]
303
+ # if B_mat.ndim != 2:
304
+ # B2 = B_mat.reshape(B_mat.shape[0], -1) # [out, r]
305
+ # else:
306
+ # B2 = B_mat
307
+
308
+ # # Shape check: skip this w_name if shapes do not match (to avoid further errors)
309
+ # # Required: the out dim of B2 [out, r] matches the out dim of vW2 [out, in_flat]
310
+ # # Required: the in_flat dim of A2 [r, in_flat] matches the in_flat dim of vW2 [out, in_flat]
311
+ # if B2.shape[0] != vW2.shape[0] or A2.shape[1] != vW2.shape[1] or A2.shape[0] != B2.shape[1]:
312
+ # missed += 1
313
+ # continue
314
+
315
+ # vA = torch.matmul(B2.transpose(0, 1), vW2) # [r, in_flat]
316
+ # vB = torch.matmul(vW2, A2.transpose(0, 1)) # [out, r]
317
+
318
+
319
+ # # hook A
320
+ # if A.requires_grad:
321
+ # vA_norm2 = (vA * vA).sum().detach()
322
+ # if vA_norm2.item() > EPS:
323
+ # def make_hook_A(v, vn2):
324
+ # def hook(g):
325
+ # if debug and not printed["A"]:
326
+ # print(f"[hook fired] lora_A grad norm: {g.norm().item():.4e}")
327
+ # printed["A"] = True
328
+ # gv = (g * v).sum()
329
+ # proj = (gv / (vn2 + EPS)) * v
330
+ # return g - proj
331
+ # return hook
332
+
333
+ # A.register_hook(make_hook_A(vA, vA_norm2))
334
+ # hooked_A += 1
335
+
336
+ # # hook B
337
+ # if B.requires_grad:
338
+ # vB_norm2 = (vB * vB).sum().detach()
339
+ # if vB_norm2.item() > EPS:
340
+ # def make_hook_B(v, vn2):
341
+ # def hook(g):
342
+ # if debug and not printed["B"]:
343
+ # print(f"[hook fired] lora_B grad norm: {g.norm().item():.4e}")
344
+ # printed["B"] = True
345
+ # gv = (g * v).sum()
346
+ # proj = (gv / (vn2 + EPS)) * v
347
+ # return g - proj
348
+ # return hook
349
+
350
+ # B.register_hook(make_hook_B(vB, vB_norm2))
351
+ # hooked_B += 1
352
+
353
+ # print(f"[hook summary] hooked lora_A: {hooked_A}, hooked lora_B: {hooked_B}, missed: {missed}")
354
+ # import pdb; pdb.set_trace()
355
+
356
+
357
+
358
+ # Usage:
359
+ # vector_sd = torch.load("your_vector.pth")["state_dict"] or similar
360
+ # register_orthogonal_grad_hook(model, vector_sd)
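+ # Note: register these hooks after wrapping the model with get_peft_model (the parameter lookups
+ # above assume PEFT's "base_model.model." prefix) and before the first backward pass of training.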
361
+
362
+
363
+ # import debugpy
364
+ # try:
365
+ # debugpy.listen(("localhost", 9501))
366
+ # print("Waiting for debugger attach")
367
+ # debugpy.wait_for_client()
368
+ # except Exception as e:
369
+ # pass
370
+
371
+
372
+ @dataclass
373
+ class FinetuneConfig:
374
+ # fmt: off
375
+ vla_path: str = "openvla/openvla-7b" # Path to OpenVLA model (on HuggingFace Hub or stored locally)
376
+
377
+ # Dataset
378
+ data_root_dir: Path = Path("datasets/rlds") # Directory containing RLDS datasets
379
+ dataset_name: str = "aloha_scoop_x_into_bowl" # Name of fine-tuning dataset (e.g., `aloha_scoop_x_into_bowl`)
380
+ run_root_dir: Path = Path("runs") # Path to directory to store logs & checkpoints
381
+ shuffle_buffer_size: int = 100_000 # Dataloader shuffle buffer size (can reduce if OOM errors occur)
382
+
383
+ # Algorithm and architecture
384
+ use_l1_regression: bool = True # If True, trains continuous action head with L1 regression objective
385
+ use_diffusion: bool = False # If True, trains continuous action head with diffusion modeling objective (DDIM)
386
+ num_diffusion_steps_train: int = 50 # (When `diffusion==True`) Number of diffusion steps used for training
387
+ use_film: bool = False # If True, uses FiLM to infuse language inputs into visual features
388
+ num_images_in_input: int = 1 # Number of images in the VLA input (default: 1)
389
+ use_proprio: bool = False # If True, includes robot proprioceptive state in input
390
+
391
+ # Training configuration
392
+ batch_size: int = 8 # Batch size per device (total batch size = batch_size * num GPUs)
393
+ learning_rate: float = 5e-4 # Learning rate
394
+ lr_warmup_steps: int = 0 # Number of steps to warm up learning rate (from 10% to 100%)
395
+ num_steps_before_decay: int = 100_000 # Number of steps before LR decays by 10x
396
+ grad_accumulation_steps: int = 1 # Number of gradient accumulation steps
397
+ max_steps: int = 200_000 # Max number of training steps
398
+ use_val_set: bool = False # If True, uses validation set and log validation metrics
399
+ val_freq: int = 10_000 # (When `use_val_set==True`) Validation set logging frequency in steps
400
+ val_time_limit: int = 180 # (When `use_val_set==True`) Time limit for computing validation metrics
401
+ save_freq: int = 10_000 # Checkpoint saving frequency in steps
402
+ save_latest_checkpoint_only: bool = False # If True, saves only 1 checkpoint, overwriting latest checkpoint
403
+ # (If False, saves all checkpoints)
404
+ scheduler: str = 'MultiStepLR' # "MultiStepLR" or "CosineAnnealingLR" or "WarmupCosineLR"
405
+ resume: bool = False # If True, resumes from checkpoint
406
+ resume_step: Optional[int] = None # (When `resume==True`) Step number that we are resuming from
407
+ image_aug: bool = True # If True, trains with image augmentations (HIGHLY RECOMMENDED)
408
+ diffusion_sample_freq: int = 50 # (When `use_diffusion==True`) Frequency for sampling in steps
409
+
410
+ # LoRA
411
+ use_lora: bool = True # If True, uses LoRA fine-tuning
412
+ lora_rank: int = 32 # Rank of LoRA weight matrix
413
+ lora_dropout: float = 0.0 # Dropout applied to LoRA weights
414
+ merge_lora_during_training: bool = True # If True, merges LoRA weights and saves result during training
415
+ # Note: Merging can be very slow on some machines. If so, set to
416
+ # False and merge final checkpoint offline!
417
+
418
+ # Regularization
419
+ regularization_lora_vector_path: Optional[str] = None       # Path to the regularization vector
420
+ regularization_weight: float = 1e-3 # Weight of regularization loss
421
+
422
+ # Logging
423
+ wandb_entity: str = "your-wandb-entity" # Name of WandB entity
424
+ wandb_project: str = "your-wandb-project" # Name of WandB project
425
+ run_id_note: Optional[str] = None # Extra note to add to end of run ID for logging
426
+ run_id_override: Optional[str] = None # Optional string to override the run ID with
427
+ wandb_log_freq: int = 10 # WandB logging frequency in steps
428
+
429
+ # EMA
430
+ use_ema: bool = False # If True, maintains an EMA copy of the model
431
+ inv_gamma: float = 1 # EMA inverse gamma parameter
432
+
433
+ # fmt: on
434
+
435
+
436
+ def remove_ddp_in_checkpoint(state_dict) -> dict:
437
+ """
438
+ Removes the 'module.' prefix from parameter names in a PyTorch model state dictionary that was saved using
439
+ DistributedDataParallel (DDP).
440
+
441
+ When a model is trained using PyTorch's DistributedDataParallel, the saved state dictionary contains parameters
442
+ prefixed with 'module.'. This function removes these prefixes to make the state dictionary compatible when
443
+ loading into models that are not yet wrapped in DDP.
444
+
445
+ Args:
446
+ state_dict (dict): PyTorch model state dictionary.
447
+
448
+ Returns:
449
+ dict: A new state dictionary with the same contents but with 'module.' prefixes removed from parameter names.
450
+ Parameters without the 'module.' prefix remain unchanged.
451
+ """
452
+ new_state_dict = {}
453
+ for k, v in state_dict.items():
454
+ if k[:7] == "module.":
455
+ new_state_dict[k[7:]] = v
456
+ else:
457
+ new_state_dict[k] = v
458
+ return new_state_dict
459
+
460
+
461
+ def get_run_id(cfg) -> str:
462
+ """
463
+ Generates or retrieves an identifier string for an experiment run.
464
+
465
+ Args:
466
+ cfg (FinetuneConfig): Training configuration.
467
+
468
+ Returns:
469
+ str: Experiment run ID.
470
+ """
471
+ if cfg.run_id_override is not None:
472
+ # Override the run ID with the user-provided ID
473
+ run_id = cfg.run_id_override
474
+ elif cfg.resume:
475
+ # Override run ID with the previous resumed run's ID
476
+ run_id = cfg.vla_path.split("/")[-1]
477
+ # Remove the "--XXX_chkpt" suffix from the run ID if it exists
478
+ if "chkpt" in run_id.split("--")[-1]:
479
+ run_id = "--".join(run_id.split("--")[:-1])
480
+ else:
481
+ run_id = (
482
+ f"{cfg.vla_path.split('/')[-1]}+{cfg.dataset_name}"
483
+ f"+b{cfg.batch_size * cfg.grad_accumulation_steps}"
484
+ f"+lr-{cfg.learning_rate}"
485
+ )
486
+ if cfg.use_lora:
487
+ run_id += f"+lora-r{cfg.lora_rank}+dropout-{cfg.lora_dropout}"
488
+ if cfg.image_aug:
489
+ run_id += "--image_aug"
490
+ if cfg.run_id_note is not None:
491
+ run_id += f"--{cfg.run_id_note}"
492
+ return run_id
493
+
494
+
495
+ def load_checkpoint(module_name: str, path: str, step: int, device: str = "cpu") -> dict:
496
+ """
497
+ Loads a checkpoint for a given module.
498
+
499
+ Args:
500
+ module_name (str): Name of model component to load checkpoint for.
501
+ path (str): Path to checkpoint directory.
502
+ step (int): Gradient step number of saved checkpoint.
503
+ device (str): String specifying how to remap storage locations (default = "cpu").
504
+
505
+ Returns:
506
+ dict: PyTorch model state dictionary.
507
+ """
508
+ checkpoint_path = os.path.join(path, f"{module_name}--{step}_checkpoint.pt")
509
+ print(f"Loading checkpoint: {checkpoint_path}")
510
+ state_dict = torch.load(checkpoint_path, weights_only=True, map_location=device)
511
+ return remove_ddp_in_checkpoint(state_dict)
512
+
513
+
514
+ def wrap_ddp(module: nn.Module, device_id: int, find_unused: bool = False) -> DDP:
515
+ """
516
+ Wrap a module with DistributedDataParallel.
517
+
518
+ Args:
519
+ module (nn.Module): PyTorch module.
520
+ device_id (str): Device ID.
521
+ find_unused (bool): Whether to detect parameters without gradients in distributed training.
522
+
523
+ Returns:
524
+ DistributedDataParallel: PyTorch module wrapped with DDP.
525
+ """
526
+ return DDP(module, device_ids=[device_id], find_unused_parameters=find_unused, gradient_as_bucket_view=True)
527
+
528
+
529
+ def count_parameters(module: nn.Module, name: str) -> None:
530
+ """
531
+ Counts and prints the number of trainable parameters in a module.
532
+
533
+ Args:
534
+ module (nn.Module): PyTorch module.
535
+ module_name (str): Name of model component.
536
+
537
+ Returns:
538
+ None.
539
+ """
540
+ num_params = sum(p.numel() for p in module.parameters() if p.requires_grad)
541
+ print(f"# trainable params in {name}: {num_params}")
542
+
543
+
544
+ def init_module(
545
+ module_class: Type[nn.Module],
546
+ module_name: str,
547
+ cfg: FinetuneConfig,
548
+ device_id: int,
549
+ module_args: dict,
550
+ to_bf16: bool = False,
551
+ find_unused_params: bool = False,
552
+ ) -> DDP:
553
+ """
554
+ Initializes a module, optionally loads checkpoint, moves to device, and wraps with DDP.
555
+
556
+ Args:
557
+ module_class (Type[nn.Module]): Class of PyTorch module to initialize.
558
+ module_name (str): Name of model component to load checkpoint for.
559
+ cfg (FinetuneConfig): Training configuration.
560
+ device_id (str): Device ID.
561
+ module_args (dict): Args for initializing the module.
562
+ to_bf16 (bool): Whether to convert to torch.bfloat16 data type.
563
+ find_unused_params (bool): Whether to detect parameters without gradients in distributed training.
564
+
565
+ Returns:
566
+ DistributedDataParallel: PyTorch module wrapped with DDP.
567
+ """
568
+ module = module_class(**module_args)
569
+ count_parameters(module, module_name)
570
+
571
+ if cfg.resume:
572
+ state_dict = load_checkpoint(module_name, cfg.vla_path, cfg.resume_step)
573
+ module.load_state_dict(state_dict)
574
+
575
+ if to_bf16:
576
+ module = module.to(torch.bfloat16)
577
+ module = module.to(device_id)
578
+
579
+ return wrap_ddp(module, device_id, find_unused_params)
580
+
581
+
582
+ def run_forward_pass(
583
+ vla,
584
+ action_head,
585
+ noisy_action_projector,
586
+ proprio_projector,
587
+ batch,
588
+ action_tokenizer,
589
+ device_id,
590
+ use_l1_regression,
591
+ use_diffusion,
592
+ use_proprio,
593
+ use_film,
594
+ num_patches,
595
+ compute_diffusion_l1=False,
596
+ num_diffusion_steps_train=None,
597
+ ) -> Tuple[torch.Tensor, Dict[str, float]]:
598
+ """
599
+ Compute model forward pass and metrics for both training and validation.
600
+
601
+ Args:
602
+ vla (OpenVLAForActionPrediction): Vision-language-action policy.
603
+ action_head (nn.Module): Action head module.
604
+ noisy_action_projector (nn.Module): Noisy action projector module (only used for diffusion).
605
+ proprio_projector (nn.Module): Proprioceptive state projector module.
606
+ batch (dict): Input batch.
607
+ action_tokenizer (ActionTokenizer): Action tokenizer.
608
+ device_id (str): Device ID.
609
+ use_l1_regression (bool): Whether to use L1 regression.
610
+ use_diffusion (bool): Whether to use diffusion.
611
+ use_proprio (bool): Whether to use proprioceptive state as input.
612
+ use_film (bool): Whether to use FiLM for better language following.
613
+ num_patches (int): Number of vision patches.
614
+ compute_diffusion_l1 (bool): Whether to sample actions and compute L1 loss for diffusion (do this once every
615
+ diffusion_sample_freq steps during training; do it every batch for validation)
616
+ num_diffusion_steps_train (int): Number of diffusion steps for training (only used for diffusion).
617
+
618
+ Returns:
619
+ tuple: (loss, metrics_dict)
620
+ loss: The loss tensor with gradient for backpropagation.
621
+ metrics_dict: Dictionary of computed metrics (detached values for logging).
622
+ """
623
+ metrics = {}
624
+
625
+ # Get ground-truth action labels
626
+ ground_truth_actions = batch["actions"].to(device_id).to(torch.bfloat16)
627
+
628
+ # [Only for diffusion] Sample noisy actions used as input for noise predictor network
629
+ if use_diffusion:
630
+ noisy_dict = action_head.module.sample_noisy_actions(ground_truth_actions)
631
+ noise, noisy_actions, diffusion_timestep_embeddings = (
632
+ noisy_dict["noise"],
633
+ noisy_dict["noisy_actions"],
634
+ noisy_dict["diffusion_timestep_embeddings"],
635
+ )
636
+ else:
637
+ noise, noisy_actions, diffusion_timestep_embeddings = None, None, None
638
+
639
+ # VLA forward pass
640
+ with torch.autocast("cuda", dtype=torch.bfloat16):
641
+ output: CausalLMOutputWithPast = vla(
642
+ input_ids=batch["input_ids"].to(device_id),
643
+ attention_mask=batch["attention_mask"].to(device_id),
644
+ pixel_values=batch["pixel_values"].to(torch.bfloat16).to(device_id),
645
+ labels=batch["labels"],
646
+ output_hidden_states=True,
647
+ proprio=batch["proprio"] if use_proprio else None,
648
+ proprio_projector=proprio_projector if use_proprio else None,
649
+ noisy_actions=noisy_actions if use_diffusion else None,
650
+ noisy_action_projector=noisy_action_projector if use_diffusion else None,
651
+ diffusion_timestep_embeddings=diffusion_timestep_embeddings if use_diffusion else None,
652
+ use_film=use_film,
653
+ )
654
+
655
+ # Get action masks needed for logging
656
+ ground_truth_token_ids = batch["labels"][:, 1:].to(device_id)
657
+ current_action_mask = get_current_action_mask(ground_truth_token_ids)
658
+ next_actions_mask = get_next_actions_mask(ground_truth_token_ids)
659
+
660
+ # Compute metrics for discrete action representation (next-token prediction)
661
+ if not (use_l1_regression or use_diffusion):
662
+ loss = output.loss
663
+ predicted_token_ids = output.logits[:, num_patches:-1].argmax(dim=2)
664
+ curr_action_accuracy = compute_token_accuracy(
665
+ predicted_token_ids, ground_truth_token_ids, mask=current_action_mask
666
+ )
667
+ curr_action_l1_loss = compute_actions_l1_loss(
668
+ action_tokenizer, predicted_token_ids, ground_truth_token_ids, mask=current_action_mask
669
+ )
670
+ next_actions_accuracy = compute_token_accuracy(
671
+ predicted_token_ids, ground_truth_token_ids, mask=next_actions_mask
672
+ )
673
+ next_actions_l1_loss = compute_actions_l1_loss(
674
+ action_tokenizer, predicted_token_ids, ground_truth_token_ids, mask=next_actions_mask
675
+ )
676
+ metrics.update(
677
+ {
678
+ "loss_value": loss.item(), # Detached value for logging
679
+ "curr_action_accuracy": curr_action_accuracy.item(),
680
+ "curr_action_l1_loss": curr_action_l1_loss.item(),
681
+ "next_actions_accuracy": next_actions_accuracy.item(),
682
+ "next_actions_l1_loss": next_actions_l1_loss.item(),
683
+ }
684
+ )
685
+ # Compute metrics for continuous action representations (L1 regression | diffusion)
686
+ else:
687
+ # Get last layer hidden states
688
+ last_hidden_states = output.hidden_states[-1] # (B, seq_len, D)
689
+ # Get hidden states for text portion of prompt+response (after the vision patches)
690
+ text_hidden_states = last_hidden_states[:, num_patches:-1]
691
+ # Get hidden states for action portion of response
692
+ batch_size = batch["input_ids"].shape[0]
693
+ actions_hidden_states = (
694
+ text_hidden_states[current_action_mask | next_actions_mask]
695
+ .reshape(batch_size, NUM_ACTIONS_CHUNK * ACTION_DIM, -1)
696
+ .to(torch.bfloat16)
697
+ ) # (B, act_chunk_len, D)
698
+
699
+ if use_l1_regression:
700
+ # Predict action
701
+ predicted_actions = action_head.module.predict_action(actions_hidden_states)
702
+ # Get full L1 loss
703
+ loss = torch.nn.L1Loss()(ground_truth_actions, predicted_actions)
704
+
705
+ if use_diffusion:
706
+ # Predict noise
707
+ noise_pred = action_head.module.predict_noise(actions_hidden_states)
708
+ # Get diffusion noise prediction MSE loss
709
+ noise_pred = noise_pred.reshape(noise.shape)
710
+ loss = nn.functional.mse_loss(noise_pred, noise, reduction="mean")
711
+
712
+ # Only sample actions and compute L1 losses if specified
713
+ if compute_diffusion_l1:
714
+ with torch.no_grad():
715
+ predicted_actions = run_diffusion_sampling(
716
+ vla=vla,
717
+ action_head=action_head,
718
+ noisy_action_projector=noisy_action_projector,
719
+ proprio_projector=proprio_projector,
720
+ batch=batch,
721
+ batch_size=batch_size,
722
+ num_patches=num_patches,
723
+ actions_shape=ground_truth_actions.shape,
724
+ device_id=device_id,
725
+ current_action_mask=current_action_mask,
726
+ next_actions_mask=next_actions_mask,
727
+ use_proprio=use_proprio,
728
+ use_film=use_film,
729
+ )
730
+
731
+ metrics.update(
732
+ {
733
+ "loss_value": loss.item(), # Detached value for logging
734
+ }
735
+ )
736
+
737
+ # Get detailed L1 losses for logging
738
+ should_log_l1_loss = not use_diffusion or (use_diffusion and compute_diffusion_l1)
739
+ if should_log_l1_loss:
740
+ ground_truth_curr_action = ground_truth_actions[:, 0]
741
+ predicted_curr_action = predicted_actions[:, 0]
742
+ ground_truth_next_actions = ground_truth_actions[:, 1:]
743
+ predicted_next_actions = predicted_actions[:, 1:]
744
+ curr_action_l1_loss = torch.nn.L1Loss()(ground_truth_curr_action, predicted_curr_action)
745
+ next_actions_l1_loss = torch.nn.L1Loss()(ground_truth_next_actions, predicted_next_actions)
746
+ metrics.update(
747
+ {
748
+ "curr_action_l1_loss": curr_action_l1_loss.item(),
749
+ "next_actions_l1_loss": next_actions_l1_loss.item(),
750
+ }
751
+ )
752
+
753
+ # Return both the loss tensor (with gradients) and the metrics dictionary (with detached values)
754
+ return loss, metrics
755
+
756
+
757
+ def run_diffusion_sampling(
758
+ vla,
759
+ action_head,
760
+ noisy_action_projector,
761
+ proprio_projector,
762
+ batch,
763
+ batch_size,
764
+ num_patches,
765
+ actions_shape,
766
+ device_id,
767
+ current_action_mask,
768
+ next_actions_mask,
769
+ use_proprio,
770
+ use_film,
771
+ ) -> torch.Tensor:
772
+ """
773
+ Run diffusion sampling (reverse diffusion) to generate actions.
774
+
775
+ Args:
776
+ vla (OpenVLAForActionPrediction): Vision-language-action policy.
777
+ action_head (nn.Module): Action head module.
778
+ noisy_action_projector (nn.Module): Noisy action projector module (only used for diffusion).
779
+ proprio_projector (nn.Module): Proprioceptive state projector module.
780
+ batch (dict): Input batch.
781
+ batch_size (int): Batch size.
782
+ num_patches (int): Number of vision patches.
783
+ actions_shape (tuple): Shape of ground-truth actions.
784
+ device_id (str): Device ID.
785
+ current_action_mask (torch.Tensor): Mask for current action.
786
+ next_actions_mask (torch.Tensor): Mask for next actions.
787
+ use_proprio (bool): Whether to use proprioceptive state as input.
788
+ use_film (bool): Whether to use FiLM for better language following.
789
+
790
+ Returns:
791
+ torch.Tensor: Predicted actions.
792
+ """
793
+ # Sample random noisy action, used as the starting point for reverse diffusion
794
+ noise = torch.randn(
795
+ size=(batch_size, NUM_ACTIONS_CHUNK, ACTION_DIM),
796
+ device=device_id,
797
+ dtype=torch.bfloat16,
798
+ ) # (B, chunk_len, action_dim)
799
+
800
+ # Set diffusion timestep values
801
+ action_head.module.noise_scheduler.set_timesteps(action_head.module.num_diffusion_steps_train)
802
+
803
+ # Reverse diffusion: Iteratively denoise to generate action, conditioned on observation
804
+ curr_noisy_actions = noise
805
+ for t in action_head.module.noise_scheduler.timesteps:
806
+ # Get diffusion model's noise prediction (conditioned on VLA latent embedding, current noisy action embedding,
807
+ # and diffusion timestep embedding)
808
+ timesteps = torch.Tensor([t]).repeat(batch_size).to(device_id)
809
+ diffusion_timestep_embeddings = (
810
+ action_head.module.time_encoder(timesteps).to(curr_noisy_actions.dtype).to(curr_noisy_actions.device)
811
+ ) # (B, llm_dim)
812
+ diffusion_timestep_embeddings = diffusion_timestep_embeddings.unsqueeze(1) # (B, 1, llm_dim)
813
+
814
+ with torch.autocast("cuda", dtype=torch.bfloat16):
815
+ output = vla(
816
+ input_ids=batch["input_ids"].to(device_id),
817
+ attention_mask=batch["attention_mask"].to(device_id),
818
+ pixel_values=batch["pixel_values"].to(torch.bfloat16).to(device_id),
819
+ labels=batch["labels"],
820
+ output_hidden_states=True,
821
+ proprio=batch["proprio"] if use_proprio else None,
822
+ proprio_projector=proprio_projector if use_proprio else None,
823
+ noisy_actions=curr_noisy_actions,
824
+ noisy_action_projector=noisy_action_projector,
825
+ diffusion_timestep_embeddings=diffusion_timestep_embeddings,
826
+ use_film=use_film,
827
+ )
828
+ # Get last layer hidden states
829
+ last_hidden_states = output.hidden_states[-1] # (B, seq_len, D)
830
+ # Get hidden states for text portion of prompt+response (after the vision patches)
831
+ text_hidden_states = last_hidden_states[:, num_patches:-1]
832
+ # Get hidden states for action portion of response
833
+ actions_hidden_states = text_hidden_states[current_action_mask | next_actions_mask].reshape(
834
+ batch_size, NUM_ACTIONS_CHUNK * ACTION_DIM, -1
835
+ ) # (B, act_chunk_len, D)
836
+ actions_hidden_states = actions_hidden_states.to(torch.bfloat16)
837
+ # Predict noise
838
+ noise_pred = action_head.module.predict_noise(actions_hidden_states)
839
+
840
+ # Compute the action at the previous diffusion timestep: x_t -> x_{t-1}
841
+ curr_noisy_actions = action_head.module.noise_scheduler.step(noise_pred, t, curr_noisy_actions).prev_sample
842
+
843
+ return curr_noisy_actions.reshape(actions_shape)
844
+
845
+
846
+ def compute_smoothened_metrics(metrics_deques) -> dict:
847
+ """
848
+ Compute smoothened metrics from recent deques.
849
+
850
+ Args:
851
+ metrics_deques (dict): Dictionary of deques containing recent metrics.
852
+
853
+ Returns:
854
+ dict: Dictionary of smoothened metrics.
855
+ """
856
+ smoothened_metrics = {}
857
+ for name, deque in metrics_deques.items():
858
+ if deque and len(deque) > 0:
859
+ smoothened_metrics[name] = sum(deque) / len(deque)
860
+ return smoothened_metrics
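A quick sketch of the helper above (hypothetical deque values): empty deques are skipped and the remaining ones are averaged.
from collections import deque
recent = {"loss_value": deque([0.5, 0.25], maxlen=4), "curr_action_l1_loss": deque(maxlen=4)}
print(compute_smoothened_metrics(recent))  # {'loss_value': 0.375} -- the empty deque is skipped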
861
+
862
+
863
+ def compute_diff_regularization_loss(model, diff_params_dict, regularization_weight=1.0):
864
+ """
865
+ Compute a regularization loss between the model's parameters and the same-named parameters from diff_path, to keep the model from updating in the direction of the diff_path parameters.
866
+ Following the orthogonality-loss formulation, the inner product between parameters is used to penalize similarity.
867
+
868
+ Args:
869
+ model: The model (possibly DDP-wrapped).
870
+ diff_params_dict: Parameter dictionary loaded from diff_path.
871
+ regularization_weight: Regularization weight.
872
+
873
+ Returns:
874
+ regularization_loss: Regularization loss value.
875
+ """
876
+ orthogonal_loss = 0.
877
+ matched_count = 0
878
+
879
+ # Get the underlying module (in case the model is DDP-wrapped)
880
+ model_module = model.module if hasattr(model, 'module') else model
881
+
882
+ for name, param in model_module.named_parameters():
883
+ if "lora" in name:
884
+ if not param.requires_grad:
885
+ continue
886
+
887
+ # Try to match the same-named parameter in diff_params_dict
888
+ # Possible naming differences have to be handled:
889
+ # 1. diff_path may be missing the "base_model.model." prefix
890
+ # 2. diff_path may have an extra ".default" after .lora_A or .lora_B
891
+ # e.g., the model has "xxx.lora_A.weight"
892
+ # while the diff has "xxx.lora_A.default.weight"
893
+ matched_diff_param = None
894
+
895
+ # First try a direct match
896
+ if name in diff_params_dict:
897
+ # import pdb; pdb.set_trace()
898
+ matched_diff_param = diff_params_dict[name]
899
+ else:
900
+ # import pdb; pdb.set_trace()
901
+ # Try to handle the ".default" naming difference after .lora_A or .lora_B
902
+ # Following O-LoRA, only the lora_A parameters are constrained
903
+ if ".lora_A." in name:
904
+ name_with_default = name.replace(".lora_A.default.", ".lora_A.")
905
+ if name_with_default in diff_params_dict:
906
+ matched_diff_param = diff_params_dict[name_with_default]
907
+ # elif ".lora_B." in name:
908
+ # name_with_default = name.replace(".lora_B.default.", ".lora_B.")
909
+ # if name_with_default in diff_params_dict:
910
+ # matched_diff_param = diff_params_dict[name_with_default]
911
+
912
+ if matched_diff_param is not None:
913
+ # print(f"Matched parameter: {name}")
914
+ # Make sure the parameters are on the same device
915
+ diff_param = matched_diff_param.to(device=param.device, dtype=param.dtype)
916
+
917
+ # Check that the shapes match
918
+ if param.shape == diff_param.shape:
919
+ # Use detach().clone().requires_grad_() to avoid DDP's duplicate parameter-marking issue
920
+ # This creates a new tensor that keeps the gradient connection without triggering DDP's duplicate marking
921
+ param_safe = param.clone()
922
+ diff_param_safe = diff_param.detach().clone()
923
+
924
+ # For multi-dimensional LoRA parameters inside the vision model
925
+ param_flat = param_safe.reshape(-1) # [N]
926
+ diff_param_flat = diff_param_safe.reshape(-1) # [N]
927
+ inner_product = torch.abs((param_flat * diff_param_flat).sum())
928
+ orthogonal_loss += inner_product
929
+ matched_count += 1
930
+ # print(f"Regularization loss for matched parameter {name}: {inner_product}")
931
+
932
+ # print(f"Regularization loss: {orthogonal_loss}")
933
+ if matched_count > 0:
934
+ orthogonal_loss = orthogonal_loss * regularization_weight
935
+ else:
936
+ # If no parameters were matched, return 0 (with requires_grad so that backward does not error out)
937
+ # The actual gradient is 0, so training is not affected
938
+ device = next(model_module.parameters()).device
939
+ orthogonal_loss = torch.tensor(0.0, device=device, requires_grad=True)
940
+
941
+ return orthogonal_loss
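A minimal sketch of the per-parameter penalty computed above, with two hypothetical LoRA-A matrices standing in for a trainable weight and the same-named weight from diff_path: the penalty is the absolute inner product of the flattened tensors, which vanishes exactly when they are orthogonal.
import torch
lora_A_model = torch.randn(8, 16, requires_grad=True)  # hypothetical trainable LoRA-A weight
lora_A_diff = torch.randn(8, 16)                        # hypothetical same-named weight from diff_path
# Same computation as for one matched parameter inside compute_diff_regularization_loss
inner_product = torch.abs((lora_A_model.reshape(-1) * lora_A_diff.reshape(-1)).sum())
inner_product.backward()  # the gradient pushes lora_A_model toward orthogonality with lora_A_diff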
942
+
943
+
944
+ def load_diff_params(diff_path, device="cpu"):
945
+ """
946
+ Load parameters from a safetensors or pth file.
947
+
948
+ Args:
949
+ diff_path: Path to the parameter file.
950
+ device: Device to load the parameters onto.
951
+
952
+ Returns:
953
+ diff_params_dict: Parameter dictionary.
954
+ """
955
+ diff_params_dict = {}
956
+
957
+ if diff_path.endswith('.safetensors'):
958
+ if not SAFETENSORS_AVAILABLE:
959
+ raise ImportError("safetensors library is required to load .safetensors files")
960
+
961
+ with safe_open(diff_path, framework="pt", device=device) as f:
962
+ for key in f.keys():
963
+ diff_params_dict[key] = f.get_tensor(key)
964
+ else:
965
+ # Assume pth or another torch format
966
+ loaded = torch.load(diff_path, map_location=device)
967
+ if isinstance(loaded, dict):
968
+ if "state_dict" in loaded:
969
+ diff_params_dict = loaded["state_dict"]
970
+ else:
971
+ diff_params_dict = loaded
972
+ else:
973
+ diff_params_dict = loaded
974
+
975
+ return diff_params_dict
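A small round-trip sketch for the loader above, using a hypothetical .pt file (a .safetensors file would go through the safe_open branch instead); the returned dict is what compute_diff_regularization_loss expects as diff_params_dict.
import torch
torch.save({"layer.lora_A.weight": torch.zeros(4, 4)}, "/tmp/diff_example.pt")  # hypothetical path
diff_params_dict = load_diff_params("/tmp/diff_example.pt", device="cpu")
print(list(diff_params_dict.keys()))  # ['layer.lora_A.weight']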
976
+
977
+
978
+ def log_metrics_to_wandb(metrics, prefix, step, wandb_entity) -> None:
979
+ """
980
+ Log metrics to Weights & Biases.
981
+
982
+ Args:
983
+ metrics (dict): Dictionary of metrics to log
984
+ prefix (str): Prefix for metric names
985
+ step (int): Training step
986
+ wandb_entity (str): W&B entity instance
987
+
988
+ Returns:
989
+ None.
990
+ """
991
+ log_dict = {}
992
+ for name, value in metrics.items():
993
+ # Map loss_value to Loss for better readability in W&B
994
+ if name == "loss_value":
995
+ log_dict[f"{prefix}/Loss"] = value
996
+ # Keep other metrics as is
997
+ else:
998
+ log_dict[f"{prefix}/{name.replace('_', ' ').title()}"] = value
999
+ wandb_entity.log(log_dict, step=step)
1000
+
1001
+
1002
+ def save_training_checkpoint(
1003
+ cfg,
1004
+ run_dir,
1005
+ log_step,
1006
+ vla,
1007
+ processor,
1008
+ proprio_projector,
1009
+ noisy_action_projector,
1010
+ action_head,
1011
+ train_dataset,
1012
+ distributed_state,
1013
+ ) -> None:
1014
+ """
1015
+ Save all training checkpoints including model components, LoRA adapter, and dataset statistics.
1016
+
1017
+ Args:
1018
+ cfg (FinetuneConfig): Training configuration.
1019
+ run_dir (Path): Experiment run directory path.
1020
+ log_step (int): Current logging step.
1021
+ vla (OpenVLAForActionPrediction): Vision-language-action policy.
1022
+ processor (PrismaticProcessor): OpenVLA inputs processor.
1023
+ proprio_projector (nn.Module): Proprioceptive state projector module.
1024
+ noisy_action_projector (nn.Module): Noisy action projector module (only used for diffusion).
1025
+ action_head (nn.Module): Action head module.
1026
+ train_dataset (RLDSDataset): Training dataset.
1027
+ distributed_state (PartialState): Distributed training state.
1028
+
1029
+ Returns:
1030
+ None.
1031
+ """
1032
+ # Determine checkpoint paths and naming
1033
+ if cfg.save_latest_checkpoint_only:
1034
+ checkpoint_dir = run_dir
1035
+ checkpoint_name_suffix = "latest_checkpoint.pt"
1036
+ else:
1037
+ checkpoint_dir = run_dir / f"{log_step}_chkpt"
1038
+ checkpoint_name_suffix = f"{log_step}_checkpoint.pt"
1039
+
1040
+ adapter_dir = checkpoint_dir / "lora_adapter"
1041
+
1042
+ # Create directories and save dataset statistics (main process only)
1043
+ if distributed_state.is_main_process:
1044
+ os.makedirs(checkpoint_dir, exist_ok=True)
1045
+ os.makedirs(adapter_dir, exist_ok=True)
1046
+ save_dataset_statistics(train_dataset.dataset_statistics, checkpoint_dir)
1047
+ print(f"Saving Model Checkpoint for Step {log_step}")
1048
+
1049
+ # Wait for directories to be created
1050
+ dist.barrier()
1051
+
1052
+ # Save model components (main process only)
1053
+ if distributed_state.is_main_process:
1054
+ # Save processor and LoRA adapter
1055
+ processor.save_pretrained(checkpoint_dir)
1056
+ vla.module.save_pretrained(adapter_dir)
1057
+
1058
+ # Save other components
1059
+ if cfg.use_proprio and proprio_projector is not None:
1060
+ torch.save(proprio_projector.state_dict(), checkpoint_dir / f"proprio_projector--{checkpoint_name_suffix}")
1061
+
1062
+ if cfg.use_diffusion and noisy_action_projector is not None:
1063
+ torch.save(
1064
+ noisy_action_projector.state_dict(), checkpoint_dir / f"noisy_action_projector--{checkpoint_name_suffix}"
1065
+ )
1066
+
1067
+ if (cfg.use_l1_regression or cfg.use_diffusion) and action_head is not None:
1068
+ torch.save(action_head.state_dict(), checkpoint_dir / f"action_head--{checkpoint_name_suffix}")
1069
+
1070
+ if cfg.use_film:
1071
+ # To be safe, just save the entire vision backbone (not just FiLM components)
1072
+ torch.save(
1073
+ vla.module.vision_backbone.state_dict(), checkpoint_dir / f"vision_backbone--{checkpoint_name_suffix}"
1074
+ )
1075
+
1076
+ # Wait for model components to be saved
1077
+ dist.barrier()
1078
+
1079
+ # Merge LoRA weights into base model and save resulting model checkpoint
1080
+ # Note: Can be very slow on some devices; if so, we recommend merging offline
1081
+ if cfg.use_lora and cfg.merge_lora_during_training:
1082
+ base_vla = AutoModelForVision2Seq.from_pretrained(
1083
+ cfg.vla_path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True
1084
+ )
1085
+ merged_vla = PeftModel.from_pretrained(base_vla, adapter_dir)
1086
+ merged_vla = merged_vla.merge_and_unload()
1087
+
1088
+ if distributed_state.is_main_process:
1089
+ merged_vla.save_pretrained(checkpoint_dir)
1090
+ print(f"Saved merged model for Step {log_step} at: {checkpoint_dir}")
1091
+
1092
+ # Wait for merged model to be saved
1093
+ dist.barrier()
1094
+
1095
+
1096
+ def run_validation(
1097
+ vla,
1098
+ action_head,
1099
+ noisy_action_projector,
1100
+ proprio_projector,
1101
+ val_dataloader,
1102
+ action_tokenizer,
1103
+ device_id,
1104
+ cfg,
1105
+ num_patches,
1106
+ log_step,
1107
+ distributed_state,
1108
+ val_time_limit,
1109
+ ) -> None:
1110
+ """
1111
+ Compute validation set metrics for logging.
1112
+
1113
+ Args:
1114
+ vla (OpenVLAForActionPrediction): Vision-language-action policy.
1115
+ action_head (nn.Module): Action head module.
1116
+ noisy_action_projector (nn.Module): Noisy action projector module (only used for diffusion).
1117
+ proprio_projector (nn.Module): Proprioceptive state projector module.
1118
+ val_dataloader (DataLoader): Validation data loader.
1119
+ action_tokenizer (ActionTokenizer): Action tokenizer.
1120
+ device_id (str): Device ID.
1121
+ cfg (FinetuneConfig): Training configuration.
1122
+ num_patches (int): Number of vision patches.
1123
+ log_step (int): Current logging step.
1124
+ distributed_state (PartialState): Distributed training state.
1125
+ val_time_limit (int): Time limit for computing validation metrics.
1126
+
1127
+ Returns:
1128
+ None.
1129
+ """
1130
+ val_start_time = time.time()
1131
+ vla.eval()
1132
+ val_batches_count = 0
1133
+
1134
+ # List to store validation metrics
1135
+ all_val_metrics = []
1136
+
1137
+ with torch.no_grad():
1138
+ for batch in val_dataloader:
1139
+ # Always compute L1 loss for validation, even for diffusion
1140
+ _, metrics = run_forward_pass(
1141
+ vla=vla,
1142
+ action_head=action_head,
1143
+ noisy_action_projector=noisy_action_projector,
1144
+ proprio_projector=proprio_projector,
1145
+ batch=batch,
1146
+ action_tokenizer=action_tokenizer,
1147
+ device_id=device_id,
1148
+ use_l1_regression=cfg.use_l1_regression,
1149
+ use_diffusion=cfg.use_diffusion,
1150
+ use_proprio=cfg.use_proprio,
1151
+ use_film=cfg.use_film,
1152
+ num_patches=num_patches,
1153
+ compute_diffusion_l1=True,
1154
+ num_diffusion_steps_train=cfg.num_diffusion_steps_train if cfg.use_diffusion else None,
1155
+ )
1156
+
1157
+ # Add the loss value to the metrics
1158
+ metrics["loss"] = metrics["loss_value"]
1159
+ all_val_metrics.append(metrics)
1160
+ val_batches_count += 1
1161
+
1162
+ # Cut testing on validation set short if it exceeds time limit
1163
+ if time.time() - val_start_time > val_time_limit:
1164
+ break
1165
+
1166
+ # Compute average validation metrics
1167
+ avg_val_metrics = {}
1168
+ for metric_name in all_val_metrics[0].keys():
1169
+ values = [metrics[metric_name] for metrics in all_val_metrics if metric_name in metrics]
1170
+ if values:
1171
+ avg_val_metrics[metric_name] = sum(values) / len(values)
1172
+
1173
+ # Add batch count to metrics
1174
+ avg_val_metrics["val_batches_count"] = val_batches_count
1175
+
1176
+ # Log validation metrics to W&B
1177
+ if distributed_state.is_main_process:
1178
+ log_metrics_to_wandb(avg_val_metrics, "VLA Val", log_step, wandb)
1179
+
1180
+
1181
+ @draccus.wrap()
1182
+ def finetune(cfg: FinetuneConfig) -> None:
1183
+ """
1184
+ Fine-tunes base VLA on demonstration dataset via LoRA.
1185
+
1186
+ Allows toggling different action representations (discrete vs. continuous), different learning objectives
1187
+ (next-token prediction vs. L1 regression vs. diffusion), FiLM. Also allows for additional model inputs,
1188
+ such as additional camera images and robot proprioceptive state. Assumes parallel action generation with
1189
+ action chunking.
1190
+
1191
+ Args:
1192
+ cfg (FinetuneConfig): Training configuration.
1193
+
1194
+ Returns:
1195
+ None.
1196
+ """
1197
+ assert cfg.use_lora, "Only LoRA fine-tuning is supported. Please set --use_lora=True!"
1198
+ assert not (cfg.use_l1_regression and cfg.use_diffusion), (
1199
+ "Cannot do both L1 regression and diffusion. Please pick one of them!"
1200
+ )
1201
+
1202
+ # Trim trailing forward slash ('/') in VLA path if it exists
1203
+ cfg.vla_path = cfg.vla_path.rstrip("/")
1204
+ print(f"Fine-tuning OpenVLA Model `{cfg.vla_path}` on `{cfg.dataset_name}`")
1205
+
1206
+ # Get experiment run ID
1207
+ run_id = get_run_id(cfg)
1208
+
1209
+ # Create experiment run directory
1210
+ run_dir = cfg.run_root_dir / run_id
1211
+ os.makedirs(run_dir, exist_ok=True)
1212
+
1213
+ # GPU setup
1214
+ distributed_state = PartialState()
1215
+ device_id = distributed_state.local_process_index
1216
+ torch.cuda.set_device(device_id)
1217
+ torch.cuda.empty_cache()
1218
+
1219
+ # Initialize wandb logging
1220
+ if distributed_state.is_main_process:
1221
+ wandb.init(entity=cfg.wandb_entity, project=cfg.wandb_project, name=run_id, id=run_id)
1222
+
1223
+ # Print detected constants
1224
+ print(
1225
+ "Detected constants:\n"
1226
+ f"\tNUM_ACTIONS_CHUNK: {NUM_ACTIONS_CHUNK}\n"
1227
+ f"\tACTION_DIM: {ACTION_DIM}\n"
1228
+ f"\tPROPRIO_DIM: {PROPRIO_DIM}\n"
1229
+ f"\tACTION_PROPRIO_NORMALIZATION_TYPE: {ACTION_PROPRIO_NORMALIZATION_TYPE}"
1230
+ )
1231
+
1232
+ # Two options:
1233
+ # (1) Base model is on Hugging Face Hub
1234
+ # - Then download it and record the path to the download directory
1235
+ # (2) Base model is stored locally
1236
+ # - Then register model config in HF Auto Classes
1237
+ # In both cases, we want to check whether any changes have been made to
1238
+ # the `modeling_prismatic.py` file in this codebase; if so, we will copy
1239
+ # the file to the downloaded or locally stored checkpoint directory so
1240
+ # that the user's changes to the VLA class logic go into effect
1241
+ if model_is_on_hf_hub(cfg.vla_path):
1242
+ # Download model directly from Hugging Face Hub
1243
+ vla_download_path = snapshot_download(repo_id=cfg.vla_path)
1244
+ # Overwrite VLA path
1245
+ cfg.vla_path = vla_download_path
1246
+ else:
1247
+ # Register OpenVLA model to HF Auto Classes (not needed if the model is on HF Hub)
1248
+ AutoConfig.register("openvla", OpenVLAConfig)
1249
+ AutoImageProcessor.register(OpenVLAConfig, PrismaticImageProcessor)
1250
+ AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
1251
+ AutoModelForVision2Seq.register(OpenVLAConfig, OpenVLAForActionPrediction)
1252
+
1253
+ # Update config.json and sync model files
1254
+ if distributed_state.is_main_process:
1255
+ update_auto_map(cfg.vla_path)
1256
+ check_model_logic_mismatch(cfg.vla_path)
1257
+
1258
+ # Wait for model files to be synced
1259
+ dist.barrier()
1260
+
1261
+ # Load processor and VLA
1262
+ processor = AutoProcessor.from_pretrained(cfg.vla_path, trust_remote_code=True)
1263
+ vla = AutoModelForVision2Seq.from_pretrained(
1264
+ cfg.vla_path,
1265
+ torch_dtype=torch.bfloat16,
1266
+ low_cpu_mem_usage=True,
1267
+ trust_remote_code=True,
1268
+ ).to(device_id)
1269
+
1270
+ # Set number of images in VLA input
1271
+ vla.vision_backbone.set_num_images_in_input(cfg.num_images_in_input)
1272
+
1273
+ # LoRA setup
1274
+ if cfg.use_lora:
1275
+ lora_config = LoraConfig(
1276
+ r=cfg.lora_rank,
1277
+ lora_alpha=min(cfg.lora_rank, 16),
1278
+ lora_dropout=cfg.lora_dropout,
1279
+ target_modules="all-linear",
1280
+ init_lora_weights="gaussian",
1281
+ )
1282
+ vla = get_peft_model(vla, lora_config)
1283
+ vla.print_trainable_parameters()
1284
+
1285
+ # FiLM setup
1286
+ if cfg.use_film:
1287
+ count_parameters(vla.vision_backbone, "vla.vision_backbone (original)")
1288
+ # Wrap vision backbone with FiLM wrapper
1289
+ # Important: For this, must specify `vla.model.vision_backbone` instead of just `vla.vision_backbone`, since the
1290
+ # latter would cause the new wrapped backbone to be saved as a new attribute of `vla` instead of overwriting the
1291
+ # original one (due to the LoRA wrapper)
1292
+ vla.model.vision_backbone = FiLMedPrismaticVisionBackbone(
1293
+ vision_backbone=vla.model.vision_backbone,
1294
+ llm_dim=vla.llm_dim,
1295
+ )
1296
+ count_parameters(vla.vision_backbone, "vla.vision_backbone (post-wrap)")
1297
+ if cfg.resume:
1298
+ state_dict = load_checkpoint("vision_backbone", cfg.vla_path, cfg.resume_step)
1299
+ vla.model.vision_backbone.load_state_dict(state_dict)
1300
+ vla.model.vision_backbone = vla.model.vision_backbone.to(device_id)
1301
+
1302
+ # Wrap VLA with DDP
1303
+ vla = wrap_ddp(vla, device_id, find_unused=False)
1304
+
1305
+ # vla._set_static_graph()
1306
+
1307
+ # If applicable, instantiate proprio projector
1308
+ if cfg.use_proprio:
1309
+ proprio_projector = init_module(
1310
+ ProprioProjector,
1311
+ "proprio_projector",
1312
+ cfg,
1313
+ device_id,
1314
+ {"llm_dim": vla.module.llm_dim, "proprio_dim": PROPRIO_DIM},
1315
+ )
1316
+ else:
1317
+ proprio_projector = None
1318
+
1319
+ # If applicable, instantiate continuous action head for L1 regression
1320
+ if cfg.use_l1_regression:
1321
+ action_head = init_module(
1322
+ L1RegressionActionHead,
1323
+ "action_head",
1324
+ cfg,
1325
+ device_id,
1326
+ {"input_dim": vla.module.llm_dim, "hidden_dim": vla.module.llm_dim, "action_dim": ACTION_DIM},
1327
+ to_bf16=True,
1328
+ )
1329
+ else:
1330
+ action_head = None
1331
+
1332
+ # If applicable, instantiate diffusion action head and noisy action projector
1333
+ if cfg.use_diffusion:
1334
+ action_head = init_module(
1335
+ DiffusionActionHead,
1336
+ "action_head",
1337
+ cfg,
1338
+ device_id,
1339
+ {
1340
+ "input_dim": vla.module.llm_dim,
1341
+ "hidden_dim": vla.module.llm_dim,
1342
+ "action_dim": ACTION_DIM,
1343
+ "num_diffusion_steps_train": cfg.num_diffusion_steps_train,
1344
+ },
1345
+ to_bf16=True,
1346
+ )
1347
+ noisy_action_projector = init_module(
1348
+ NoisyActionProjector, "noisy_action_projector", cfg, device_id, {"llm_dim": vla.module.llm_dim}
1349
+ )
1350
+ else:
1351
+ noisy_action_projector = None
1352
+
1353
+ # EMA
1354
+ if cfg.use_ema:
1355
+ ema_vla = EMAModel(vla,
1356
+ action_head,
1357
+ proprio_projector,
1358
+ noisy_action_projector,
1359
+ inv_gamma=cfg.inv_gamma
1360
+ )
1361
+
1362
+ # Get number of vision patches
1363
+ NUM_PATCHES = vla.module.vision_backbone.get_num_patches() * vla.module.vision_backbone.get_num_images_in_input()
1364
+ # If we have proprio inputs, a single proprio embedding is appended to the end of the vision patch embeddings
1365
+ if cfg.use_proprio:
1366
+ NUM_PATCHES += 1
1367
+ # For diffusion, a single diffusion timestep embedding is appended to the end of the vision patch embeddings
1368
+ if cfg.use_diffusion:
1369
+ NUM_PATCHES += 1
1370
+
1371
+ diff_path = cfg.regularization_lora_vector_path # <- set this to your own path
1372
+
1373
+ # Load diff parameters for regularization
1374
+ diff_params_dict = {}
1375
+ if diff_path and os.path.exists(diff_path):
1376
+ print(f"Loading diff parameters from {diff_path}")
1377
+ diff_params_dict = load_diff_params(diff_path, device="cpu")
1378
+ print(f"Loaded {len(diff_params_dict)} parameters from diff_path")
1379
+ else:
1380
+ print(f"Warning: diff_path {diff_path} does not exist, skipping regularization loss")
1381
+
1382
+ # Regularization weight (you can make this configurable via cfg if needed)
1383
+ regularization_weight = cfg.regularization_weight # adjust this weight as needed
1384
+
1385
+ # Instantiate optimizer
1386
+ trainable_params = [param for param in vla.parameters() if param.requires_grad]
1387
+ if cfg.use_l1_regression or cfg.use_diffusion:
1388
+ trainable_params += [param for param in action_head.parameters() if param.requires_grad]
1389
+ if cfg.use_diffusion:
1390
+ trainable_params += [param for param in noisy_action_projector.parameters() if param.requires_grad]
1391
+ if cfg.use_proprio:
1392
+ trainable_params += [param for param in proprio_projector.parameters() if param.requires_grad]
1393
+ print(f"# total trainable params: {sum(p.numel() for p in trainable_params)}")
1394
+ optimizer = AdamW(trainable_params, lr=cfg.learning_rate)
1395
+
1396
+ # Record original learning rate
1397
+ original_lr = optimizer.param_groups[0]["lr"]
1398
+
1399
+ # Create learning rate scheduler
1400
+ if cfg.scheduler == 'MultiStepLR':
1401
+ scheduler = MultiStepLR(
1402
+ optimizer,
1403
+ milestones=[cfg.num_steps_before_decay], # Number of steps after which LR will change
1404
+ gamma=0.1, # Multiplicative factor of learning rate decay
1405
+ )
1406
+ elif cfg.scheduler == 'CosineAnnealingLR':
1407
+ scheduler = CosineAnnealingLR(
1408
+ optimizer,
1409
+ T_max=cfg.max_steps, # Total number of steps for the cosine annealing
1410
+ eta_min=cfg.learning_rate * 1e-3,
1411
+ )
1412
+ elif cfg.scheduler == 'WarmupCosineLR':
1413
+ scheduler = get_cosine_schedule_with_warmup(
1414
+ optimizer,
1415
+ num_warmup_steps=500,
1416
+ num_training_steps=cfg.max_steps,
1417
+ )
1418
+ else:
1419
+ raise ValueError(f"Unsupported scheduler type: {cfg.scheduler}")
1420
+
1421
+ # Create Action Tokenizer
1422
+ action_tokenizer = ActionTokenizer(processor.tokenizer)
1423
+
1424
+ # Load Fine-tuning Dataset =>> note that we use an RLDS-formatted dataset following Open X-Embodiment by default.
1425
+ # =>> If you want to use a non-RLDS dataset (e.g., a standard PyTorch Dataset) see the following commented block.
1426
+ # =>> Note that our training code does not loop over epochs because the RLDS loader does this implicitly; if using
1427
+ # your own Dataset, make sure to add the appropriate logic to the training loop!
1428
+ #
1429
+ # ---
1430
+ # from prismatic.vla.datasets import DummyDataset
1431
+ #
1432
+ # train_dataset = DummyDataset(
1433
+ # action_tokenizer,
1434
+ # processor.tokenizer,
1435
+ # image_transform=processor.image_processor.apply_transform,
1436
+ # prompt_builder_fn=PurePromptBuilder,
1437
+ # )
1438
+ # ---
1439
+
1440
+ # We assume that the model takes as input one third-person camera image and 1 or 2 optional wrist camera image(s)
1441
+ use_wrist_image = cfg.num_images_in_input > 1
1442
+
1443
+ # Create training and optional validation datasets
1444
+ batch_transform = RLDSBatchTransform(
1445
+ action_tokenizer,
1446
+ processor.tokenizer,
1447
+ image_transform=processor.image_processor.apply_transform,
1448
+ prompt_builder_fn=PurePromptBuilder,
1449
+ use_wrist_image=use_wrist_image,
1450
+ use_proprio=cfg.use_proprio,
1451
+ )
1452
+ train_dataset = RLDSDataset(
1453
+ cfg.data_root_dir,
1454
+ cfg.dataset_name,
1455
+ batch_transform,
1456
+ resize_resolution=tuple(vla.module.config.image_sizes),
1457
+ shuffle_buffer_size=cfg.shuffle_buffer_size,
1458
+ image_aug=cfg.image_aug,
1459
+ )
1460
+ if cfg.use_val_set:
1461
+ val_dataset = RLDSDataset(
1462
+ cfg.data_root_dir,
1463
+ cfg.dataset_name,
1464
+ batch_transform,
1465
+ resize_resolution=tuple(vla.module.config.image_sizes),
1466
+ shuffle_buffer_size=cfg.shuffle_buffer_size // 10,
1467
+ image_aug=cfg.image_aug,
1468
+ train=False,
1469
+ )
1470
+
1471
+ # [Important] Save dataset statistics so that we can unnormalize actions during inference
1472
+ if distributed_state.is_main_process:
1473
+ save_dataset_statistics(train_dataset.dataset_statistics, run_dir)
1474
+
1475
+ # Create collator and dataloader
1476
+ collator = PaddedCollatorForActionPrediction(
1477
+ processor.tokenizer.model_max_length, processor.tokenizer.pad_token_id, padding_side="right"
1478
+ )
1479
+ dataloader = DataLoader(
1480
+ train_dataset,
1481
+ batch_size=cfg.batch_size,
1482
+ sampler=None,
1483
+ collate_fn=collator,
1484
+ num_workers=0, # Important: Set to 0 if using RLDS, which uses its own parallelism
1485
+ )
1486
+ if cfg.use_val_set:
1487
+ val_batch_size = cfg.batch_size
1488
+ val_dataloader = DataLoader(
1489
+ val_dataset,
1490
+ batch_size=val_batch_size,
1491
+ sampler=None,
1492
+ collate_fn=collator,
1493
+ num_workers=0, # Important: Set to 0 if using RLDS, which uses its own parallelism
1494
+ )
1495
+
1496
+ # Deque to store recent train metrics (used for computing smoothened metrics for gradient accumulation)
1497
+ recent_metrics = {
1498
+ "loss_value": deque(maxlen=cfg.grad_accumulation_steps),
1499
+ "curr_action_accuracy": deque(maxlen=cfg.grad_accumulation_steps),
1500
+ "curr_action_l1_loss": deque(maxlen=cfg.grad_accumulation_steps),
1501
+ "next_actions_accuracy": deque(maxlen=cfg.grad_accumulation_steps),
1502
+ "next_actions_l1_loss": deque(maxlen=cfg.grad_accumulation_steps),
1503
+ "regularization_loss": deque(maxlen=cfg.grad_accumulation_steps),
1504
+ }
1505
+
1506
+ # Start training
1507
+ with tqdm.tqdm(total=cfg.max_steps, leave=False) as progress:
1508
+ vla.train()
1509
+ optimizer.zero_grad()
1510
+ for batch_idx, batch in enumerate(dataloader):
1511
+ # Compute training metrics and loss
1512
+ compute_diffusion_l1 = cfg.use_diffusion and batch_idx % cfg.diffusion_sample_freq == 0
1513
+ loss, metrics = run_forward_pass(
1514
+ vla=vla,
1515
+ action_head=action_head,
1516
+ noisy_action_projector=noisy_action_projector if cfg.use_diffusion else None,
1517
+ proprio_projector=proprio_projector if cfg.use_proprio else None,
1518
+ batch=batch,
1519
+ action_tokenizer=action_tokenizer,
1520
+ device_id=device_id,
1521
+ use_l1_regression=cfg.use_l1_regression,
1522
+ use_diffusion=cfg.use_diffusion,
1523
+ use_proprio=cfg.use_proprio,
1524
+ use_film=cfg.use_film,
1525
+ num_patches=NUM_PATCHES,
1526
+ compute_diffusion_l1=compute_diffusion_l1,
1527
+ num_diffusion_steps_train=cfg.num_diffusion_steps_train if cfg.use_diffusion else None,
1528
+ )
1529
+
1530
+ # Add regularization loss if diff_params_dict is available
1531
+ if diff_params_dict:
1532
+ ########################### Regularization Loss ##########################
1533
+ regularization_loss = compute_diff_regularization_loss(
1534
+ vla, diff_params_dict, regularization_weight=regularization_weight
1535
+ )
1536
+ # print(f"Regularization loss: {regularization_loss}")
1537
+ # print(f"Main loss: {loss}")
1538
+ # The two lines below are for gradient checking
1539
+ # Save the main loss for gradient checking
1540
+ # main_loss = loss.clone()
1541
+ # reg_loss = regularization_loss.clone()
1542
+ # print('loss:', loss)
1543
+ # print('regularization_loss:', regularization_loss)
1544
+
1545
+ # with vla.no_sync():
1546
+ # regularization_loss.backward()
1547
+
1548
+ # model_module = vla.module if hasattr(vla, 'module') else vla
1549
+ # reg_grads = {}
1550
+ # for name, param in model_module.named_parameters():
1551
+ # if "lora_A" in name and param.requires_grad and param.grad is not None:
1552
+
1553
+ # reg_grads[name] = param.grad.clone()
1554
+
1555
+
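+ # Zero-valued dummy term touching every trainable parameter so that all of them participate in the backward graph (e.g., to avoid DDP unused-parameter errors)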
1556
+ dummy_loss = 0.0
1557
+ for p in vla.parameters():
1558
+ if p.requires_grad:
1559
+ dummy_loss = dummy_loss + p.sum() * 0.0
1560
+
1561
+ print('action loss:', loss)
1562
+ print('regularization_loss:', regularization_loss)
1563
+ print('dummy_loss:', dummy_loss)
1564
+
1565
+ loss = loss + regularization_loss + dummy_loss
1566
+
1567
+
1568
+
1569
+ loss.backward()
1570
+ # main_grads = {}
1571
+ # for name, param in model_module.named_parameters():
1572
+ # if "lora_A" in name and param.requires_grad and param.grad is not None:
1573
+
1574
+ # main_grads[name] = param.grad.clone()
1575
+
1576
+ # print('################################################')
1577
+ # for name in main_grads.keys():
1578
+ # if name in reg_grads:
1579
+ # main_grad_norm = main_grads[name].norm().item()
1580
+ # reg_grad_norm = reg_grads[name].norm().item()
1581
+ # combined_grad_norm = (main_grads[name] + reg_grads[name]).norm().item()
1582
+ # print(f" {name}:")
1583
+ # print(f" Main-loss grad norm: {main_grad_norm:.6f}")
1584
+ # print(f" Regularization-loss grad norm: {reg_grad_norm:.6f}")
1585
+ # print(f" Combined grad norm: {combined_grad_norm:.6f}")
1586
+
1587
+
1588
+ # print('################################################')
1589
+ # # Log regularization loss
1590
+ # metrics["regularization_loss"] = regularization_loss.item()
1591
+ # #############################################################################
1592
+
1593
+ # # The if-block below is for gradient checking
1594
+ # # Check the gradients of the two losses separately (before backward)
1595
+ # if diff_params_dict and batch_idx % cfg.wandb_log_freq == 0:
1596
+ # # Get model parameters for gradient checking
1597
+ # model_module = vla.module if hasattr(vla, 'module') else vla
1598
+
1599
+ # # Zero the gradients first
1600
+ # optimizer.zero_grad()
1601
+
1602
+ # # Backward only on the main loss
1603
+ # main_loss_normalized = main_loss / cfg.grad_accumulation_steps
1604
+ # main_loss_normalized.backward(retain_graph=True)
1605
+
1606
+ # # Save the gradients of the main loss
1607
+ # main_grads = {}
1608
+ # for name, param in model_module.named_parameters():
1609
+ # if "lora_A" in name and param.requires_grad and param.grad is not None:
1610
+
1611
+ # main_grads[name] = param.grad.clone()
1612
+
1613
+ # # Zero the gradients, then backward only on the regularization loss
1614
+ # optimizer.zero_grad()
1615
+ # reg_loss_normalized = reg_loss / cfg.grad_accumulation_steps
1616
+ # reg_loss_normalized.backward(retain_graph=True)
1617
+
1618
+ # # Save the gradients of the regularization loss
1619
+ # reg_grads = {}
1620
+ # for name, param in model_module.named_parameters():
1621
+ # if "lora_A" in name and param.requires_grad and param.grad is not None:
1622
+ # reg_grads[name] = param.grad.clone()
1623
+
1624
+ # # Print gradient info
1625
+ # print(f"\n[Gradient check] Step {batch_idx // cfg.grad_accumulation_steps}")
1626
+ # sample_count = 0
1627
+ # for name in main_grads.keys():
1628
+ # if name in reg_grads:
1629
+ # main_grad_norm = main_grads[name].norm().item()
1630
+ # reg_grad_norm = reg_grads[name].norm().item()
1631
+ # combined_grad_norm = (main_grads[name] + reg_grads[name]).norm().item()
1632
+ # print(f" {name}:")
1633
+ # print(f" Main-loss grad norm: {main_grad_norm:.6f}")
1634
+ # print(f" Regularization-loss grad norm: {reg_grad_norm:.6f}")
1635
+ # print(f" Combined grad norm: {combined_grad_norm:.6f}")
1636
+ # sample_count += 1
1637
+ # if sample_count >= 3: # Only check the first 3 parameters as examples
1638
+ # break
1639
+ # print()
1640
+
1641
+ # # Zero the gradients in preparation for the normal backward
1642
+ # optimizer.zero_grad()
1643
+
1644
+ # # Normalize loss to account for gradient accumulation
1645
+ # normalized_loss = loss / cfg.grad_accumulation_steps
1646
+
1647
+ # # Backward pass
1648
+ # normalized_loss.backward()
1649
+
1650
+ # Store recent train metrics
1651
+ for metric_name, value in metrics.items():
1652
+ if metric_name in recent_metrics:
1653
+ recent_metrics[metric_name].append(value)
1654
+
1655
+ # Compute gradient step index
1656
+ gradient_step_idx = batch_idx // cfg.grad_accumulation_steps
1657
+
1658
+ # Compute smoothened train metrics
1659
+ smoothened_metrics = compute_smoothened_metrics(recent_metrics)
1660
+
1661
+ # Push Metrics to W&B (every wandb_log_freq gradient steps)
1662
+ log_step = gradient_step_idx if not cfg.resume else cfg.resume_step + gradient_step_idx
1663
+ if distributed_state.is_main_process and log_step % cfg.wandb_log_freq == 0:
1664
+ log_metrics_to_wandb(smoothened_metrics, "VLA Train", log_step, wandb)
1665
+
1666
+ # [If applicable] Linearly warm up learning rate from 10% to 100% of original
1667
+ if cfg.lr_warmup_steps > 0:
1668
+ lr_progress = min((gradient_step_idx + 1) / cfg.lr_warmup_steps, 1.0) # Cap at 1.0
1669
+ current_lr = original_lr * (0.1 + 0.9 * lr_progress)
1670
+ for param_group in optimizer.param_groups:
1671
+ param_group["lr"] = current_lr
1672
+
1673
+ # Optimizer and LR scheduler step
1674
+ if (batch_idx + 1) % cfg.grad_accumulation_steps == 0:
1675
+ optimizer.step()
1676
+ scheduler.step()
1677
+ optimizer.zero_grad()
1678
+ progress.update()
1679
+ if cfg.use_ema:
1680
+ ema_vla.step(vla, action_head, proprio_projector, noisy_action_projector)
1681
+
1682
+ if distributed_state.is_main_process and gradient_step_idx % cfg.wandb_log_freq == 0:
1683
+ # Log the learning rate
1684
+ # Make sure to do this AFTER any learning rate modifications (e.g., warmup/decay)
1685
+ wandb.log(
1686
+ {
1687
+ "VLA Train/Learning Rate": scheduler.get_last_lr()[0],
1688
+ },
1689
+ step=log_step,
1690
+ )
1691
+
1692
+ if cfg.use_ema:
1693
+ # Log the EMA decay value
1694
+ wandb.log(
1695
+ {
1696
+ "VLA Train/EMA Decay": ema_vla.decay,
1697
+ },
1698
+ step=log_step,
1699
+ )
1700
+ # Log the EMA eval loss
1701
+ ema_vla.apply_shadow(vla, action_head, proprio_projector, noisy_action_projector)
1702
+ with torch.no_grad():
1703
+ vla.eval()
1704
+ action_head.eval() if action_head else None
1705
+ _, ema_metrics = run_forward_pass(
1706
+ vla=vla,
1707
+ action_head=action_head,
1708
+ noisy_action_projector=noisy_action_projector if cfg.use_diffusion else None,
1709
+ proprio_projector=proprio_projector if cfg.use_proprio else None,
1710
+ batch=batch,
1711
+ action_tokenizer=action_tokenizer,
1712
+ device_id=device_id,
1713
+ use_l1_regression=cfg.use_l1_regression,
1714
+ use_diffusion=cfg.use_diffusion,
1715
+ use_proprio=cfg.use_proprio,
1716
+ use_film=cfg.use_film,
1717
+ num_patches=NUM_PATCHES,
1718
+ compute_diffusion_l1=compute_diffusion_l1,
1719
+ num_diffusion_steps_train=cfg.num_diffusion_steps_train if cfg.use_diffusion else None,
1720
+ )
1721
+ ema_loss = ema_metrics['loss_value']
1722
+ vla.train()
1723
+ action_head.train() if action_head else None
1724
+ ema_vla.restore(vla, action_head, proprio_projector, noisy_action_projector)
1725
+ wandb.log(
1726
+ {
1727
+ "VLA Train/EMA Loss": ema_loss,
1728
+ },
1729
+ step=log_step,
1730
+ )
1731
+
1732
+ # Save model checkpoint: either keep latest checkpoint only or all checkpoints
1733
+ if gradient_step_idx > 0 and log_step % cfg.save_freq == 0:
1734
+ save_training_checkpoint(
1735
+ cfg=cfg,
1736
+ run_dir=run_dir,
1737
+ log_step=log_step,
1738
+ vla=vla,
1739
+ processor=processor,
1740
+ proprio_projector=proprio_projector if cfg.use_proprio else None,
1741
+ noisy_action_projector=noisy_action_projector if cfg.use_diffusion else None,
1742
+ action_head=action_head if (cfg.use_l1_regression or cfg.use_diffusion) else None,
1743
+ train_dataset=train_dataset,
1744
+ distributed_state=distributed_state,
1745
+ )
1746
+
1747
+ if cfg.use_ema:
1748
+ # Also save EMA model checkpoint
1749
+ ema_vla.apply_shadow(vla, action_head, proprio_projector, noisy_action_projector)
1750
+ save_training_checkpoint(
1751
+ cfg=cfg,
1752
+ run_dir=run_dir / "ema_model",
1753
+ log_step=log_step,
1754
+ vla=vla,
1755
+ processor=processor,
1756
+ proprio_projector=proprio_projector if cfg.use_proprio else None,
1757
+ noisy_action_projector=noisy_action_projector if cfg.use_diffusion else None,
1758
+ action_head=action_head if (cfg.use_l1_regression or cfg.use_diffusion) else None,
1759
+ train_dataset=train_dataset,
1760
+ distributed_state=distributed_state,
1761
+ )
1762
+ ema_vla.restore(vla, action_head, proprio_projector, noisy_action_projector)
1763
+
1764
+ # Test model on validation set
1765
+ if cfg.use_val_set and log_step > 0 and log_step % cfg.val_freq == 0:
1766
+ run_validation(
1767
+ vla=vla,
1768
+ action_head=action_head,
1769
+ noisy_action_projector=noisy_action_projector if cfg.use_diffusion else None,
1770
+ proprio_projector=proprio_projector if cfg.use_proprio else None,
1771
+ val_dataloader=val_dataloader,
1772
+ action_tokenizer=action_tokenizer,
1773
+ device_id=device_id,
1774
+ cfg=cfg,
1775
+ num_patches=NUM_PATCHES,
1776
+ log_step=log_step,
1777
+ distributed_state=distributed_state,
1778
+ val_time_limit=cfg.val_time_limit,
1779
+ )
1780
+ # Set model back to training mode after validation
1781
+ vla.train()
1782
+
1783
+ # Stop training when max_steps is reached
1784
+ if log_step == cfg.max_steps:
1785
+ print(f"Max step {cfg.max_steps} reached! Stopping training...")
1786
+ break
1787
+
1788
+
1789
+ if __name__ == "__main__":
1790
+ finetune()
capvector-oft/vla-scripts/merge_lora_weights_and_save.py ADDED
@@ -0,0 +1,73 @@
1
+ """
2
+ Loads a checkpoint that only has a LoRA adapter (no merged model) and merges the adapter
3
+ into the base OpenVLA model. Saves the final checkpoint in the same directory.
4
+
5
+ Make sure to specify the correct base checkpoint when running this script. For example,
6
+ - if you fine-tuned the default OpenVLA-7B model without modifications, then use `--base_checkpoint openvla/openvla-7b`
7
+ - if you fine-tuned a different model or resumed fine-tuning from a different checkpoint, then specify that base checkpoint
8
+ - if you fine-tuned the default OpenVLA-7B model with modifications to `modeling_prismatic.py` (OpenVLA class definition),
9
+ then the base checkpoint path should point to the checkpoint containing the modifications
10
+
11
+ Usage:
12
+ python vla-scripts/merge_lora_weights_and_save.py \
13
+ --base_checkpoint openvla/openvla-7b \
14
+ --lora_finetuned_checkpoint_dir /PATH/TO/CHECKPOINT/DIR/
15
+ """
16
+
17
+ import os
18
+ import time
19
+ from dataclasses import dataclass
20
+ from pathlib import Path
21
+ from typing import Union
22
+
23
+ import draccus
24
+ import torch
25
+ from peft import PeftModel
26
+ from transformers import AutoConfig, AutoImageProcessor, AutoModelForVision2Seq, AutoProcessor
27
+
28
+ from prismatic.extern.hf.configuration_prismatic import OpenVLAConfig
29
+ from prismatic.extern.hf.modeling_prismatic import OpenVLAForActionPrediction
30
+ from prismatic.extern.hf.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
31
+
32
+
33
+ @dataclass
34
+ class ConvertConfig:
35
+ # fmt: off
36
+
37
+ base_checkpoint: Union[str, Path] = "" # Base model checkpoint path/dir (either openvla/openvla-7b or whichever model you fine-tuned / resumed training from)
38
+ lora_finetuned_checkpoint_dir: Union[str, Path] = "" # Checkpoint directory containing the LoRA adapter
39
+
40
+ # fmt: on
41
+
42
+
43
+ @draccus.wrap()
44
+ def main(cfg: ConvertConfig) -> None:
45
+ # Register OpenVLA model to HF Auto Classes (not needed if the model is on HF Hub)
46
+ AutoConfig.register("openvla", OpenVLAConfig)
47
+ AutoImageProcessor.register(OpenVLAConfig, PrismaticImageProcessor)
48
+ AutoProcessor.register(OpenVLAConfig, PrismaticProcessor)
49
+ AutoModelForVision2Seq.register(OpenVLAConfig, OpenVLAForActionPrediction)
50
+
51
+ # Load Model using HF AutoClasses
52
+ print(f"Loading base model: {cfg.base_checkpoint}")
53
+ vla = AutoModelForVision2Seq.from_pretrained(
54
+ cfg.base_checkpoint,
55
+ torch_dtype=torch.bfloat16,
56
+ low_cpu_mem_usage=True,
57
+ trust_remote_code=True,
58
+ )
59
+
60
+ # Load LoRA weights and merge into base model, then save final checkpoint
61
+ print("Merging LoRA weights into base model...")
62
+ start_time = time.time()
63
+ merged_vla = PeftModel.from_pretrained(vla, os.path.join(cfg.lora_finetuned_checkpoint_dir, "lora_adapter")).to(
64
+ "cuda"
65
+ )
66
+ merged_vla = merged_vla.merge_and_unload()
67
+ merged_vla.save_pretrained(cfg.lora_finetuned_checkpoint_dir)
68
+ print(f"\nMerging complete! Time elapsed (sec): {time.time() - start_time}")
69
+ print(f"\nSaved merged model checkpoint at:\n{cfg.lora_finetuned_checkpoint_dir}")
70
+
71
+
72
+ if __name__ == "__main__":
73
+ main()
capvector-pi05/.dockerignore ADDED
@@ -0,0 +1,3 @@
1
+ .venv
2
+ checkpoints
3
+ data
capvector-pi05/.gitignore ADDED
@@ -0,0 +1,169 @@
1
+ # Data directories.
2
+ assets/
3
+ checkpoints/
4
+ data/
5
+ wandb/
6
+
7
+ # Byte-compiled / optimized / DLL files
8
+ __pycache__/
9
+ *.py[cod]
10
+ *$py.class
11
+
12
+ # C extensions
13
+ *.so
14
+
15
+ # Distribution / packaging
16
+ .Python
17
+ build/
18
+ develop-eggs/
19
+ dist/
20
+ downloads/
21
+ eggs/
22
+ .eggs/
23
+ lib/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ share/python-wheels/
30
+ *.egg-info/
31
+ .installed.cfg
32
+ *.egg
33
+ MANIFEST
34
+
35
+ # PyInstaller
36
+ # Usually these files are written by a python script from a template
37
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
38
+ *.manifest
39
+ *.spec
40
+
41
+ # Installer logs
42
+ pip-log.txt
43
+ pip-delete-this-directory.txt
44
+
45
+ # Unit test / coverage reports
46
+ htmlcov/
47
+ .tox/
48
+ .nox/
49
+ .coverage
50
+ .coverage.*
51
+ .cache
52
+ nosetests.xml
53
+ coverage.xml
54
+ *.cover
55
+ *.py,cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+ cover/
59
+
60
+ # Translations
61
+ *.mo
62
+ *.pot
63
+
64
+ # Django stuff:
65
+ *.log
66
+ local_settings.py
67
+ db.sqlite3
68
+ db.sqlite3-journal
69
+
70
+ # Flask stuff:
71
+ instance/
72
+ .webassets-cache
73
+
74
+ # Scrapy stuff:
75
+ .scrapy
76
+
77
+ # Sphinx documentation
78
+ docs/_build/
79
+
80
+ # PyBuilder
81
+ .pybuilder/
82
+ target/
83
+
84
+ # Jupyter Notebook
85
+ .ipynb_checkpoints
86
+
87
+ # IPython
88
+ profile_default/
89
+ ipython_config.py
90
+
91
+ # pyenv
92
+ # For a library or package, you might want to ignore these files since the code is
93
+ # intended to run in multiple environments; otherwise, check them in:
94
+ # .python-version
95
+
96
+ # pipenv
97
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
98
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
99
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
100
+ # install all needed dependencies.
101
+ #Pipfile.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ .idea/
169
+ .vscode/
capvector-pi05/.gitmodules ADDED
@@ -0,0 +1,6 @@
1
+ [submodule "third_party/aloha"]
2
+ path = third_party/aloha
3
+ url = https://github.com/Physical-Intelligence/aloha.git
4
+ [submodule "third_party/libero"]
5
+ path = third_party/libero
6
+ url = https://github.com/Lifelong-Robot-Learning/LIBERO.git
capvector-pi05/.pre-commit-config.yaml ADDED
@@ -0,0 +1,16 @@
1
+ exclude: third_party/
2
+
3
+ repos:
4
+ - repo: https://github.com/astral-sh/uv-pre-commit
5
+ # uv version.
6
+ rev: 0.5.14
7
+ hooks:
8
+ - id: uv-lock
9
+ - repo: https://github.com/astral-sh/ruff-pre-commit
10
+ # Ruff version.
11
+ rev: v0.8.6
12
+ hooks:
13
+ # Run the linter.
14
+ - id: ruff
15
+ args: [--fix]
16
+ - id: ruff-format
capvector-pi05/.python-version ADDED
@@ -0,0 +1 @@
1
+ 3.11
capvector-pi05/LICENSE ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
capvector-pi05/README.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## 1. Environment Setup
2
+ We use [uv](https://docs.astral.sh/uv/) to manage Python dependencies. See the [uv installation instructions](https://docs.astral.sh/uv/getting-started/installation/) to set it up. Once uv is installed, run the following to set up the environment:
3
+
4
+ ```bash
5
+ GIT_LFS_SKIP_SMUDGE=1 uv sync
6
+ GIT_LFS_SKIP_SMUDGE=1 uv pip install -e .
7
+ cp -r ./src/openpi/models_pytorch/transformers_replace/* .venv/lib/python3.11/site-packages/transformers/
8
+ source .venv/bin/activate
9
+ ```
10
+
11
+ NOTE: `GIT_LFS_SKIP_SMUDGE=1` is needed to pull LeRobot as a dependency.
12
+
13
+
14
+ ## 2. Data Preparation
15
+ Here we take the real-world Aloha data as an example; for more details on simulation data, refer to the [official openpi repo](https://github.com/Physical-Intelligence/openpi/).
16
+
17
+ First, you need to collect the task-specific raw data with your own robot, and save it in the `.hdf5` format.
18
+
19
+ Then, convert the data to LeRobot dataset format.
20
+ ```bash
21
+ uv run examples/aloha_real/convert_aloha_data_to_lerobot.py --raw-dir /path/to/raw/data --repo-id <org>/<dataset-name>
22
+ # By default, the converted data is stored in ~/.cache/huggingface/lerobot/<org>/<dataset-name>/
23
+ ```
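+
+ The conversion script expects raw episodes named `episode_*.hdf5`. Roughly the following layout is assumed (field names are taken from [convert_aloha_data_to_lerobot.py](examples/aloha_real/convert_aloha_data_to_lerobot.py); `qvel` and `effort` are optional, and shapes are shown for a standard bi-manual Aloha):
+ ```
+ episode_0.hdf5
+ ├── /action                      # (T, 14) per-step joint targets
+ ├── /observations/qpos           # (T, 14) joint positions
+ ├── /observations/qvel           # optional joint velocities
+ ├── /observations/effort         # optional joint efforts
+ └── /observations/images/<cam>   # (T, 480, 640, 3) raw or JPEG-compressed frames
+ ```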
24
+
25
+
26
+ ## 3. Obtain the capability vectors and merge them to obtain $\theta_{meta}$
27
+
28
+ First, define your task-specific config in [config.py](src/openpi/training/config.py); we provide an example for our real-world task [here](src/openpi/training/config.py#L776-L808).
29
+
30
+ Then, convert a JAX model checkpoint to PyTorch format:
31
+ ```bash
32
+ uv run examples/convert_jax_model_to_pytorch.py \
33
+ --checkpoint_dir gs://openpi-assets/checkpoints/pi05_base \
34
+ --config_name <config_name> \
35
+ --output_path checkpoints/pytorch_pi05_base
36
+ # This command will automatically download the pi05_base checkpoint to ~/.cache/openpi/openpi-assets/checkpoints/pi05_base/
37
+ # Otherwise you can download it manually and modify the --checkpoint_dir
38
+ ```
39
+
40
+ > ⭐ If you don't use the regularization strategy, you can download the [capability-merged meta model](https://huggingface.co/haofuly/capvector_models_collection/capvector_pi05/merged_model) we provide, place it at `./checkpoints/vector_init/pi05SF-LIBEROspatial_minus_pi05-LIBEROspatial/`, and skip directly to the [Training step](#4-training).
41
+
42
+ Then, the capability vectors are obtained by simply conducting parameter arithmetic between two models finetuned with different strategies, so we need to prepare these two trained models, *e.g.*, [Pi0.5 on LIBERO-Spatial](https://huggingface.co/haofuly/capvector_models_collection/capvector_pi05/pi05_baseline_30000step_spatial) and [Pi0.5-SF on LIBERO-Spatial](https://huggingface.co/haofuly/capvector_models_collection/capvector_pi05/pi05_spatialforcing_30000step_spatial). The directory structure is as follows:
43
+ ```
44
+ capvector-pi05
+ ├── checkpoints
+ │   ├── pi05-LIBEROspatial
+ │   │   ├── model.safetensors
+ │   │   └── ...
+ │   ├── pi05SF-LIBEROspatial
+ │   │   ├── model.safetensors
+ │   │   └── ...
+ │   ├── diff
+ │   ├── vector_init
+ │   └── ...
+ └── ...
55
+ ```
56
+
57
+ Next, conduct parameter arithmetic between these two models:
58
+ ```bash
59
+ CONFIG=pi05_capvector_aloha_place_block && \
60
+ EXT=pi05SF-LIBEROspatial && \
61
+ DOWN=pi05-LIBEROspatial && \
62
+ uv run capvector/compute_param_diff.py \
63
+ --config $CONFIG \
64
+ --a.dir checkpoints/$EXT \
65
+ --b.dir checkpoints/$DOWN \
66
+ --out checkpoints/diff/${EXT}_minus_${DOWN}.pth \
67
+ --strict-keys \
68
+ --dtype fp32
69
+ ```
70
+
71
+ Finally, merge the diff parameters into the base model to obtain $\theta_{meta}$:
72
+ ```bash
73
+ DIFF=pi05SF-LIBEROspatial_minus_pi05-LIBEROspatial && \
74
+ uv run capvector/apply_param_diff.py \
75
+ --base-safetensors checkpoints/pytorch_pi05_base/model.safetensors \
76
+ --diff-pth checkpoints/diff/${DIFF}.pth \
77
+ --out-safetensors checkpoints/vector_init/${DIFF}/model.safetensors \
78
+ --scale 1.0 \
79
+ --no-strict-keys \
80
+ --dtype fp32 \
81
+ --device cpu
82
+ ```
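+
+ Together, these two steps compute, for every floating-point tensor, $\theta_{meta} = \theta_{base} + s\,(\theta_{A} - \theta_{B})$, where $\theta_A$ and $\theta_B$ are the two finetuned checkpoints (here Pi0.5-SF and Pi0.5 on LIBERO-Spatial), $\theta_{base}$ is the converted `pi05_base` checkpoint, and $s$ is `--scale` (1.0 above). Non-floating-point tensors are copied from the base model unchanged.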
83
+
84
+
85
+ ## 4. Training
86
+ First, you need to compute the normalization statistics for the training data.
87
+ ```bash
88
+ uv run scripts/compute_norm_stats.py --config-name <config_name>
89
+ ```
90
+
91
+ Then, launch training using one of these modes:
92
+ ```bash
93
+ # Single GPU training:
94
+ uv run scripts/train_regular_loss_pytorch.py <config_name> --exp_name <run_name> --save_interval <interval>
95
+ # Example:
96
+ uv run scripts/train_regular_loss_pytorch.py pi05_capvector_aloha_place_block --exp_name pytorch_test
97
+ uv run scripts/train_regular_loss_pytorch.py pi05_capvector_aloha_place_block --exp_name pytorch_test --overwrite # Overwrite existing checkpoints
98
+
99
+ # Multi-GPU training (single node):
100
+ uv run torchrun --standalone --nnodes=1 --nproc_per_node=<num_gpus> scripts/train_regular_loss_pytorch.py <config_name> --exp_name <run_name>
101
+
102
+ # Multi-Node Training:
103
+ uv run torchrun \
104
+ --nnodes=<num_nodes> \
105
+ --nproc_per_node=<gpus_per_node> \
106
+ --node_rank=<rank_of_node> \
107
+ --master_addr=<master_ip> \
108
+ --master_port=<port> \
109
+ scripts/train_regular_loss_pytorch.py <config_name> --exp_name=<run_name> --save_interval <interval>
110
+ ```
111
+
112
+
113
+ ## 5. Inference
114
+ Real-world inference runs in a server-client setup.
115
+
116
+ First, launch a model server (we use the checkpoint for iteration 20,000 for this example, modify as needed):
117
+ ```bash
118
+ uv run scripts/serve_policy.py policy:checkpoint --policy.config=<config_name> --policy.dir=checkpoints/<config_name>/<run_name>/20000
119
+ ```
120
+
121
+ This will spin up a server that listens on port 8000 and waits for observations to be sent to it.
122
+
123
+ Then, we can run a client robot script that queries the server.
124
+
125
+ You need to write your own client script for your robot. A simple [client example](examples/simple_client/main.py) can be run as below:
126
+ ```bash
127
+ uv run examples/simple_client/main.py --env ALOHA
128
+ ```
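+
+ If you prefer to embed the query loop directly in your own robot code, a minimal sketch is given below. It follows the `openpi_client` usage described in [docs/remote_inference.md](docs/remote_inference.md); the observation keys are illustrative and must match the policy inputs of the config you serve, and `get_robot_observation` / `send_to_robot` are placeholders for your own robot I/O.
+ ```python
+ from openpi_client import image_tools
+ from openpi_client import websocket_client_policy
+
+ max_steps = 1000  # placeholder episode length
+
+ # Connect once, outside the control loop (host/port of the policy server started above).
+ client = websocket_client_policy.WebsocketClientPolicy(host="localhost", port=8000)
+
+ for _ in range(max_steps):
+     img, state = get_robot_observation()  # placeholder: read a camera image + joint state from your robot
+     observation = {
+         "observation/image": image_tools.convert_to_uint8(image_tools.resize_with_pad(img, 224, 224)),
+         "observation/state": state,
+         "prompt": "put the block into the bowl",
+     }
+     # The server returns an action chunk of shape (action_horizon, action_dim); execute it open-loop.
+     for action in client.infer(observation)["actions"]:
+         send_to_robot(action)  # placeholder: command your robot
+ ```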
capvector-pi05/capvector/apply_param_diff.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
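+ """Apply a parameter diff (capability vector) to base pretrained weights: merged = base + scale * diff."""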
+ import dataclasses
2
+ import logging
3
+ from pathlib import Path
4
+
5
+ import torch
6
+ import tyro
7
+ from safetensors.torch import load_file, save_file
8
+
9
+
10
+ @dataclasses.dataclass
11
+ class Args:
12
+ # Base pretrained weights in safetensors
13
+ base_safetensors: str
14
+
15
+ # Diff checkpoint in .pth (either {"state_dict": ...} or raw state_dict)
16
+ diff_pth: str
17
+
18
+ # Output safetensors path
19
+ out_safetensors: str = "model_merged.safetensors"
20
+
21
+ # final = base + scale * diff
22
+ scale: float = 1.0
23
+
24
+ # whether keys must match exactly
25
+ strict_keys: bool = True # use --strict-keys / --no-strict-keys
26
+
27
+ # arithmetic dtype
28
+ dtype: str = "fp32" # fp32/fp16/bf16
29
+
30
+ # compute device
31
+ device: str = "cpu" # cpu/cuda
32
+
33
+
34
+ def cast(t: torch.Tensor, dtype: str) -> torch.Tensor:
35
+ if dtype == "fp32":
36
+ return t.float()
37
+ if dtype == "fp16":
38
+ return t.half()
39
+ if dtype == "bf16":
40
+ return t.bfloat16()
41
+ raise ValueError(f"Unknown dtype: {dtype}")
42
+
43
+
44
+ def load_diff_state_dict(path: str) -> dict[str, torch.Tensor]:
45
+ obj = torch.load(path, map_location="cpu")
46
+ if isinstance(obj, dict) and "state_dict" in obj and isinstance(obj["state_dict"], dict):
47
+ sd = obj["state_dict"]
48
+ elif isinstance(obj, dict):
49
+ sd = obj
50
+ else:
51
+ raise RuntimeError(f"Unexpected diff format: {type(obj)}")
52
+
53
+ for k, v in sd.items():
54
+ if not isinstance(v, torch.Tensor):
55
+ raise RuntimeError(f"Diff contains non-tensor at key={k}: {type(v)}")
56
+ return sd
57
+
58
+
59
+ def main(args: Args) -> None:
60
+ logging.info("Loading base safetensors: %s", args.base_safetensors)
61
+ base_sd = load_file(args.base_safetensors, device="cpu") # dict[str, Tensor]
62
+
63
+ logging.info("Loading diff pth: %s", args.diff_pth)
64
+ diff_sd = load_diff_state_dict(args.diff_pth)
65
+
66
+ keys_base = set(base_sd.keys())
67
+ keys_diff = set(diff_sd.keys())
68
+
69
+ if args.strict_keys:
70
+ if keys_base != keys_diff:
71
+ only_base = sorted(list(keys_base - keys_diff))[:30]
72
+ only_diff = sorted(list(keys_diff - keys_base))[:30]
73
+ raise RuntimeError(
74
+ "Keys mismatch between base safetensors and diff.\n"
75
+ f"Only in base (up to 30): {only_base}\n"
76
+ f"Only in diff (up to 30): {only_diff}\n"
77
+ "Use --no-strict-keys to apply on intersection only."
78
+ )
79
+ keys_apply = keys_base
80
+ else:
81
+ keys_apply = keys_base & keys_diff
82
+ logging.warning("Non-strict mode: applying on intersection keys: %d", len(keys_apply))
83
+
84
+ dev = torch.device(args.device)
85
+
86
+ merged_sd: dict[str, torch.Tensor] = {}
87
+ applied_float = 0
88
+ skipped_nonfloat = 0
89
+ skipped_missing = 0
90
+
91
+ for k, base_t_cpu in base_sd.items():
92
+ base_t = base_t_cpu # already on cpu
93
+
94
+ if k not in keys_apply:
95
+ merged_sd[k] = base_t
96
+ skipped_missing += 1
97
+ continue
98
+
99
+ diff_t_cpu = diff_sd[k]
100
+
101
+ if base_t.shape != diff_t_cpu.shape:
102
+ raise RuntimeError(f"Shape mismatch at key={k}: base {base_t.shape} vs diff {diff_t_cpu.shape}")
103
+
104
+ # only add for floating-point tensors
105
+ if base_t.is_floating_point() and diff_t_cpu.is_floating_point():
106
+ a = cast(base_t.to(dev), args.dtype)
107
+ d = cast(diff_t_cpu.to(dev), args.dtype)
108
+ out = a + args.scale * d
109
+ merged_sd[k] = out.to(base_t.dtype).detach().cpu()
110
+ applied_float += 1
111
+ else:
112
+ merged_sd[k] = base_t
113
+ skipped_nonfloat += 1
114
+
115
+ out_path = Path(args.out_safetensors)
116
+ out_path.parent.mkdir(parents=True, exist_ok=True)
117
+
118
+ # safetensors requires all tensors to be on CPU
119
+ for k, v in merged_sd.items():
120
+ if v.device.type != "cpu":
121
+ merged_sd[k] = v.cpu()
122
+
123
+ logging.info(
124
+ "Done. applied_float=%d, skipped_nonfloat=%d, skipped_missing=%d",
125
+ applied_float,
126
+ skipped_nonfloat,
127
+ skipped_missing,
128
+ )
129
+ logging.info("Saving merged safetensors to: %s", str(out_path))
130
+ save_file(merged_sd, str(out_path))
131
+
132
+
133
+ if __name__ == "__main__":
134
+ logging.basicConfig(level=logging.INFO, force=True)
135
+ main(tyro.cli(Args))
capvector-pi05/capvector/compute_param_diff.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
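+ """Compute a parameter diff (capability vector) between two finetuned checkpoints: diff = A - B."""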
+ import dataclasses
2
+ import logging
3
+ from pathlib import Path
4
+ from typing import Any
5
+
6
+ import torch
7
+ import tyro
8
+
9
+ from openpi.training import config as _config
10
+
11
+
12
+ @dataclasses.dataclass
13
+ class CkptSpec:
14
+ dir: str
15
+
16
+
17
+ @dataclasses.dataclass
18
+ class Args:
19
+ config: str
20
+ a: CkptSpec
21
+ b: CkptSpec
22
+ out: str = "checkpoints/diff/a_minus_b.pth"
23
+ only_vlm: bool = False
24
+ strict_keys: bool = False
25
+ dtype: str = "fp32"
26
+ device: str = "cpu"
27
+
28
+
29
+ def _extract_state_dict(obj: Any) -> dict[str, torch.Tensor]:
30
+ """
31
+ Try best to get a torch state_dict from a Policy or Module-like object.
32
+ """
33
+ # Case 1: policy itself has state_dict()
34
+ if hasattr(obj, "state_dict") and callable(obj.state_dict):
35
+ sd = obj.state_dict()
36
+ if isinstance(sd, dict) and all(isinstance(v, torch.Tensor) for v in sd.values()):
37
+ return sd
38
+
39
+ # Case 2: common attributes that hold torch.nn.Module
40
+ for attr in ["model", "_model", "module", "net", "_net", "policy", "_policy"]:
41
+ if hasattr(obj, attr):
42
+ m = getattr(obj, attr)
43
+ if hasattr(m, "state_dict") and callable(m.state_dict):
44
+ sd = m.state_dict()
45
+ if isinstance(sd, dict) and all(isinstance(v, torch.Tensor) for v in sd.values()):
46
+ return sd
47
+
48
+ raise RuntimeError(
49
+ "Cannot extract state_dict. "
50
+ "Please inspect Policy object and update attribute list in _extract_state_dict()."
51
+ )
52
+
53
+
54
+ def _cast_tensor(t: torch.Tensor, dtype: str) -> torch.Tensor:
55
+ if dtype == "fp32":
56
+ return t.float()
57
+ if dtype == "fp16":
58
+ return t.half()
59
+ if dtype == "bf16":
60
+ return t.bfloat16()
61
+ raise ValueError(f"Unknown dtype: {dtype}")
62
+
63
+
64
+ def load_model(config_name: str, spec: CkptSpec):
65
+ cfg = _config.get_config(config_name)
66
+ weight_path = Path(spec.dir) / "model.safetensors"
67
+ if not weight_path.exists():
68
+ raise FileNotFoundError(f"Missing model.safetensors in checkpoint directory: {spec.dir}")
69
+ return cfg.model.load_pytorch(cfg, str(weight_path))
70
+
71
+
72
+ def main(args: Args) -> None:
73
+ logging.info("Loading A model from %s with config %s", args.a.dir, args.config)
74
+ model_a = load_model(args.config, args.a)
75
+ logging.info("Loading B model from %s with config %s", args.b.dir, args.config)
76
+ model_b = load_model(args.config, args.b)
77
+
78
+ sd_a = _extract_state_dict(model_a)
79
+ sd_b = _extract_state_dict(model_b)
80
+
81
+ keys_a = set(sd_a.keys())
82
+ keys_b = set(sd_b.keys())
83
+
84
+ if args.strict_keys:
85
+ if keys_a != keys_b:
86
+ only_a = sorted(list(keys_a - keys_b))[:20]
87
+ only_b = sorted(list(keys_b - keys_a))[:20]
88
+ raise RuntimeError(
89
+ f"State dict keys mismatch.\n"
90
+ f"Only in A (show up to 20): {only_a}\n"
91
+ f"Only in B (show up to 20): {only_b}\n"
92
+ f"Set --strict-keys False to subtract intersection only."
93
+ )
94
+ keys = sorted(keys_a)
95
+ else:
96
+ keys = sorted(list(keys_a & keys_b))
97
+ logging.warning("Non-strict mode: subtracting only intersection keys: %d", len(keys))
98
+
99
+ device = torch.device(args.device)
100
+ diff: dict[str, torch.Tensor] = {}
101
+
102
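+ # When --only-vlm is set, zero out the action-expert and action-projection parameters so the
+ # saved diff only carries changes to the VLM backbone.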
+ if args.only_vlm:
103
+ ZERO_PREFIXES = [
104
+ "paligemma_with_expert.gemma_expert.",
105
+ "action_in_proj.",
106
+ "action_out_proj.",
107
+ "action_time_mlp_in",
108
+ "action_time_mlp_oout",
109
+ ]
110
+ else:
111
+ ZERO_PREFIXES = []
112
+
113
+ for k in keys:
114
+ ta = sd_a[k].to(device)
115
+ tb = sd_b[k].to(device)
116
+
117
+ if ta.shape != tb.shape:
118
+ raise RuntimeError(f"Shape mismatch at key={k}: {ta.shape} vs {tb.shape}")
119
+
120
+ zero_this = any(k.startswith(p) for p in ZERO_PREFIXES)
121
+
122
+ if zero_this:
123
+ out = torch.zeros_like(ta)
124
+ else:
125
+ if ta.is_floating_point():
126
+ out = _cast_tensor(ta, args.dtype) - _cast_tensor(tb, args.dtype)
127
+ else:
128
+ out = ta
129
+
130
+ diff[k] = out.detach().cpu()
131
+
132
+
133
+
134
+ out_path = Path(args.out)
135
+ out_path.parent.mkdir(parents=True, exist_ok=True)
136
+ torch.save({"state_dict": diff, "a": dataclasses.asdict(args.a), "b": dataclasses.asdict(args.b)}, out_path)
137
+ logging.info("Saved diff checkpoint to: %s", str(out_path))
138
+
139
+
140
+ if __name__ == "__main__":
141
+ logging.basicConfig(level=logging.INFO, force=True)
142
+ main(tyro.cli(Args))
capvector-pi05/docs/docker.md ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ### Docker Setup
2
+
3
+ All of the examples in this repo include instructions for running both natively and with Docker. Although not required, Docker is recommended: it simplifies software installation, produces a more stable environment, and, for examples that depend on ROS, lets you avoid installing ROS and cluttering your machine.
4
+
5
+ - Basic Docker installation instructions are [here](https://docs.docker.com/engine/install/).
6
+ - Docker must be installed in [rootless mode](https://docs.docker.com/engine/security/rootless/).
7
+ - To use your GPU you must also install the [NVIDIA container toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html).
8
+ - The version of docker installed with `snap` is incompatible with the NVIDIA container toolkit, preventing it from accessing `libnvidia-ml.so` ([issue](https://github.com/NVIDIA/nvidia-container-toolkit/issues/154)). The snap version can be uninstalled with `sudo snap remove docker`.
9
+ - Docker Desktop is also incompatible with the NVIDIA runtime ([issue](https://github.com/NVIDIA/nvidia-container-toolkit/issues/229)). Docker Desktop can be uninstalled with `sudo apt remove docker-desktop`.
10
+
11
+
12
+ If you are starting from scratch and your host machine is Ubuntu 22.04, you can accomplish all of the above with the convenience scripts `scripts/docker/install_docker_ubuntu22.sh` and `scripts/docker/install_nvidia_container_toolkit.sh`.
13
+
14
+ Build the Docker image and start the container with the following command:
15
+ ```bash
16
+ docker compose -f scripts/docker/compose.yml up --build
17
+ ```
18
+
19
+ To build and run the Docker image for a specific example, use the following command:
20
+ ```bash
21
+ docker compose -f examples/<example_name>/compose.yml up --build
22
+ ```
23
+ where `<example_name>` is the name of the example you want to run.
24
+
25
+ During the first run of any example, Docker will build the images. Go grab a coffee while this happens. Subsequent runs will be faster since the images are cached.
capvector-pi05/docs/norm_stats.md ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Normalization statistics
2
+
3
+ Following common practice, our models normalize the proprioceptive state inputs and action targets during policy training and inference. The statistics used for normalization are computed over the training data and stored alongside the model checkpoint.
4
+
5
+ ## Reloading normalization statistics
6
+
7
+ When you fine-tune one of our models on a new dataset, you need to decide whether to (A) reuse existing normalization statistics or (B) compute new statistics over your new training data. Which option is better for you depends on the similarity of your robot and task to the robot and task distribution in the pre-training dataset. Below, we list all the available pre-training normalization statistics for each model.
8
+
9
+ **If your target robot matches one of these pre-training statistics, consider reloading the same normalization statistics.** By reloading the normalization statistics, the actions in your dataset will be more "familiar" to the model, which can lead to better performance. You can reload the normalization statistics by adding an `AssetsConfig` to your training config that points to the corresponding checkpoint directory and normalization statistics ID, like below for the `Trossen` (aka ALOHA) robot statistics of the `pi0_base` checkpoint:
10
+
11
+ ```python
12
+ TrainConfig(
13
+ ...
14
+ data=LeRobotAlohaDataConfig(
15
+ ...
16
+ assets=AssetsConfig(
17
+ assets_dir="gs://openpi-assets/checkpoints/pi0_base/assets",
18
+ asset_id="trossen",
19
+ ),
20
+ ),
21
+ )
22
+ ```
23
+
24
+ For an example of a full training config that reloads normalization statistics, see the `pi0_aloha_pen_uncap` config in the [training config file](https://github.com/physical-intelligence/openpi/blob/main/src/openpi/training/config.py).
25
+
26
+ **Note:** To successfully reload normalization statistics, it's important that your robot + dataset are following the action space definitions used in pre-training. We provide a detailed description of our action space definitions below.
27
+
28
+ **Note #2:** Whether reloading normalization statistics is beneficial depends on the similarity of your robot and task to the robot and task distribution in the pre-training dataset. We recommend always trying both options, reloading the pre-training statistics and computing a fresh set on your new dataset (see the [main README](../README.md) for instructions on how to compute new statistics), and picking the one that works better for your task.
29
+
30
+
31
+ ## Provided Pre-training Normalization Statistics
32
+
33
+ Below is a list of all the pre-training normalization statistics we provide. We provide them for both the `pi0_base` and `pi0_fast_base` models. For `pi0_base`, set the `assets_dir` to `gs://openpi-assets/checkpoints/pi0_base/assets` and for `pi0_fast_base`, set the `assets_dir` to `gs://openpi-assets/checkpoints/pi0_fast_base/assets`.
34
+ | Robot | Description | Asset ID |
35
+ |-------|-------------|----------|
36
+ | ALOHA | 6-DoF dual arm robot with parallel grippers | trossen |
37
+ | Mobile ALOHA | Mobile version of ALOHA mounted on a Slate base | trossen_mobile |
38
+ | Franka Emika (DROID) | 7-DoF arm with parallel gripper based on the DROID setup | droid |
39
+ | Franka Emika (non-DROID) | Franka FR3 arm with Robotiq 2F-85 gripper | franka |
40
+ | UR5e | 6-DoF UR5e arm with Robotiq 2F-85 gripper | ur5e |
41
+ | UR5e bi-manual | Bi-manual UR5e setup with Robotiq 2F-85 grippers | ur5e_dual |
42
+ | ARX | Bi-manual ARX-5 robot arm setup with parallel gripper | arx |
43
+ | ARX mobile | Mobile version of bi-manual ARX-5 robot arm setup mounted on a Slate base | arx_mobile |
44
+ | Fibocom mobile | Fibocom mobile robot with 2x ARX-5 arms | fibocom_mobile |
45
+
46
+
47
+ ## Pi0 Model Action Space Definitions
48
+
49
+ Out of the box, both the `pi0_base` and `pi0_fast_base` use the following action space definitions (left and right are defined looking from behind the robot towards the workspace):
50
+ ```
51
+ "dim_0:dim_5": "left arm joint angles",
52
+ "dim_6": "left arm gripper position",
53
+ "dim_7:dim_12": "right arm joint angles (for bi-manual only)",
54
+ "dim_13": "right arm gripper position (for bi-manual only)",
55
+
56
+ # For mobile robots:
57
+ "dim_14:dim_15": "x-y base velocity (for mobile robots only)",
58
+ ```
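+
+ As a concrete example, a single bi-manual (ALOHA-style) action step is a 14-dimensional vector, or 16-dimensional for mobile bases. The sketch below is illustrative only; it simply assembles the layout above with NumPy, and the command values are placeholders.
+ ```python
+ import numpy as np
+
+ left_joints = np.zeros(6)    # dim_0:dim_5, radians
+ left_gripper = 0.0           # dim_6, 0.0 = fully open, 1.0 = fully closed
+ right_joints = np.zeros(6)   # dim_7:dim_12, radians (bi-manual only)
+ right_gripper = 0.0          # dim_13 (bi-manual only)
+
+ action = np.concatenate([left_joints, [left_gripper], right_joints, [right_gripper]])
+ assert action.shape == (14,)
+
+ # Mobile robots append an x-y base velocity (dim_14:dim_15) to the action;
+ # these two dimensions are not part of the proprioceptive state.
+ mobile_action = np.concatenate([action, np.zeros(2)])
+ assert mobile_action.shape == (16,)
+ ```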
59
+
60
+ The proprioceptive state uses the same definitions as the action space, except for the base x-y position (the last two dimensions) for mobile robots, which we don't include in the proprioceptive state.
61
+
62
+ For 7-DoF robots (e.g. Franka), we use the first 7 dimensions of the action space for the joint actions, and the 8th dimension for the gripper action.
63
+
64
+ General info for Pi robots:
65
+ - Joint angles are expressed in radians, with position zero corresponding to the zero position reported by each robot's interface library, except for ALOHA, where the standard ALOHA code uses a slightly different convention (see the [ALOHA example code](../examples/aloha_real/README.md) for details).
66
+ - Gripper positions are in [0.0, 1.0], with 0.0 corresponding to fully open and 1.0 corresponding to fully closed.
67
+ - Control frequencies are 20 Hz for the UR5e and Franka arms, and 50 Hz for the ARX and Trossen (ALOHA) arms.
68
+
69
+ For DROID, we use the original DROID action configuration, with joint velocity actions in the first 7 dimensions and gripper actions in the 8th dimension, at a control frequency of 15 Hz.
capvector-pi05/docs/remote_inference.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # Running openpi models remotely
3
+
4
+ We provide utilities for running openpi models remotely. This is useful for running inference on more powerful GPUs off-robot, and also helps keep the robot and policy environments separate (and e.g. avoid dependency hell with robot software).
5
+
6
+ ## Starting a remote policy server
7
+
8
+ To start a remote policy server, you can simply run the following command:
9
+
10
+ ```bash
11
+ uv run scripts/serve_policy.py --env=[DROID | ALOHA | LIBERO]
12
+ ```
13
+
14
+ The `env` argument specifies which $\pi_0$ checkpoint should be loaded. Under the hood, this script will execute a command like the following, which you can use to start a policy server, e.g. for checkpoints you trained yourself (here an example for the DROID environment):
15
+
16
+ ```bash
17
+ uv run scripts/serve_policy.py policy:checkpoint --policy.config=pi0_fast_droid --policy.dir=gs://openpi-assets/checkpoints/pi0_fast_droid
18
+ ```
19
+
20
+ This will start a policy server that will serve the policy specified by the `config` and `dir` arguments. The policy will be served on the specified port (default: 8000).
21
+
22
+ ## Querying the remote policy server from your robot code
23
+
24
+ We provide a client utility with minimal dependencies that you can easily embed into any robot codebase.
25
+
26
+ First, install the `openpi-client` package in your robot environment:
27
+
28
+ ```bash
29
+ cd $OPENPI_ROOT/packages/openpi-client
30
+ pip install -e .
31
+ ```
32
+
33
+ Then, you can use the client to query the remote policy server from your robot code. Here's an example of how to do this:
34
+
35
+ ```python
36
+ from openpi_client import image_tools
37
+ from openpi_client import websocket_client_policy
38
+
39
+ # Outside of episode loop, initialize the policy client.
40
+ # Point to the host and port of the policy server (localhost and 8000 are the defaults).
41
+ client = websocket_client_policy.WebsocketClientPolicy(host="localhost", port=8000)
42
+
43
+ for step in range(num_steps):
44
+ # Inside the episode loop, construct the observation.
45
+ # Resize images on the client side to minimize bandwidth / latency. Always return images in uint8 format.
46
+ # We provide utilities for resizing images + uint8 conversion so you match the training routines.
47
+ # The typical resize_size for pre-trained pi0 models is 224.
48
+ # Note that the proprioceptive `state` can be passed unnormalized, normalization will be handled on the server side.
49
+ observation = {
50
+ "observation/image": image_tools.convert_to_uint8(
51
+ image_tools.resize_with_pad(img, 224, 224)
52
+ ),
53
+ "observation/wrist_image": image_tools.convert_to_uint8(
54
+ image_tools.resize_with_pad(wrist_img, 224, 224)
55
+ ),
56
+ "observation/state": state,
57
+ "prompt": task_instruction,
58
+ }
59
+
60
+ # Call the policy server with the current observation.
61
+ # This returns an action chunk of shape (action_horizon, action_dim).
62
+ # Note that you typically only need to call the policy every N steps and execute steps
63
+ # from the predicted action chunk open-loop in the remaining steps.
64
+ action_chunk = client.infer(observation)["actions"]
65
+
66
+ # Execute the actions in the environment.
67
+ ...
68
+
69
+ ```
70
+
71
+ Here, the `host` and `port` arguments specify the IP address and port of the remote policy server. You can also specify these as command-line arguments to your robot code, or hard-code them in your robot codebase. The `observation` is a dictionary of observations and the prompt, following the specification of the policy inputs for the policy you are serving. We have concrete examples of how to construct this dictionary for different environments in the [simple client example](../examples/simple_client/main.py).
capvector-pi05/examples/aloha_real/Dockerfile ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dockerfile for the Aloha real environment.
2
+
3
+ # Build the container:
4
+ # docker build . -t aloha_real -f examples/aloha_real/Dockerfile
5
+
6
+ # Run the container:
7
+ # docker run --rm -it --network=host -v /dev:/dev -v .:/app --privileged aloha_real /bin/bash
8
+
9
+ FROM ros:noetic-robot@sha256:7cf0b9f6546abeba308ea42cb7ad3453f3e520e1af57cdf179fe915c939674bc
10
+ SHELL ["/bin/bash", "-c"]
11
+
12
+ ENV DEBIAN_FRONTEND=noninteractive
13
+ RUN apt-get update && \
14
+ apt-get install -y --no-install-recommends \
15
+ cmake \
16
+ curl \
17
+ libffi-dev \
18
+ python3-rosdep \
19
+ python3-rosinstall \
20
+ python3-rosinstall-generator \
21
+ whiptail \
22
+ git \
23
+ wget \
24
+ openssh-client \
25
+ ros-noetic-cv-bridge \
26
+ ros-noetic-usb-cam \
27
+ ros-noetic-realsense2-camera \
28
+ keyboard-configuration
29
+
30
+ WORKDIR /root
31
+ RUN curl 'https://raw.githubusercontent.com/Interbotix/interbotix_ros_manipulators/main/interbotix_ros_xsarms/install/amd64/xsarm_amd64_install.sh' > xsarm_amd64_install.sh
32
+ RUN chmod +x xsarm_amd64_install.sh
33
+ RUN export TZ='America/Los_Angeles' && ./xsarm_amd64_install.sh -d noetic -n
34
+
35
+ COPY ./third_party/aloha /root/interbotix_ws/src/aloha
36
+ RUN cd /root/interbotix_ws && source /opt/ros/noetic/setup.sh && source /root/interbotix_ws/devel/setup.sh && catkin_make
37
+
38
+ # Install python 3.10 because this ROS image comes with 3.8
39
+ RUN mkdir /python && \
40
+ cd /python && \
41
+ wget https://www.python.org/ftp/python/3.10.14/Python-3.10.14.tgz && \
42
+ tar -zxvf Python-3.10.14.tgz && \
43
+ cd Python-3.10.14 && \
44
+ ls -lhR && \
45
+ ./configure --enable-optimizations && \
46
+ make install && \
47
+ echo 'alias python3="/usr/local/bin/python3.10"' >> ~/.bashrc && \
48
+ echo 'alias python="/usr/local/bin/python3.10"' >> ~/.bashrc && \
49
+ cd ~ && rm -rf /python && \
50
+ rm -rf /var/lib/apt/lists/*
51
+
52
+ COPY --from=ghcr.io/astral-sh/uv:0.5.6 /uv /bin/uv
53
+ ENV UV_HTTP_TIMEOUT=120
54
+ ENV UV_LINK_MODE=copy
55
+ COPY ./examples/aloha_real/requirements.txt /tmp/requirements.txt
56
+ COPY ./packages/openpi-client/pyproject.toml /tmp/openpi-client/pyproject.toml
57
+ RUN uv pip sync --python 3.10 --system /tmp/requirements.txt /tmp/openpi-client/pyproject.toml
58
+
59
+ ENV PYTHONPATH=/app:/app/src:/app/packages/openpi-client/src:/root/interbotix_ws/src/aloha/aloha_scripts:/root/interbotix_ws/src/aloha
60
+ WORKDIR /app
61
+
62
+ # Create an entrypoint script to run the setup commands, followed by the command passed in.
63
+ RUN cat <<'EOF' > /usr/local/bin/entrypoint.sh
64
+ #!/bin/bash
65
+ source /opt/ros/noetic/setup.sh && source /root/interbotix_ws/devel/setup.sh && "$@"
66
+ EOF
67
+ RUN chmod +x /usr/local/bin/entrypoint.sh
68
+
69
+ ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
70
+ CMD ["python3", "/app/examples/aloha_real/main.py"]
capvector-pi05/examples/aloha_real/README.md ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Run Aloha (Real Robot)
2
+
3
+ This example demonstrates how to run a policy on a real robot using an [ALOHA setup](https://github.com/tonyzhaozh/aloha). See [here](../../docs/remote_inference.md) for instructions on how to load checkpoints and run inference. We list the relevant checkpoint paths for each provided fine-tuned model below.
4
+
5
+ ## Prerequisites
6
+
7
+ This repo uses a fork of the ALOHA repo, with very minor modifications to use Realsense cameras.
8
+
9
+ 1. Follow the [hardware installation instructions](https://github.com/tonyzhaozh/aloha?tab=readme-ov-file#hardware-installation) in the ALOHA repo.
10
+ 1. Modify the `third_party/aloha/aloha_scripts/realsense_publisher.py` file to use serial numbers for your cameras.
11
+
12
+ ## With Docker
13
+
14
+ ```bash
15
+ export SERVER_ARGS="--env ALOHA --default_prompt='take the toast out of the toaster'"
16
+ docker compose -f examples/aloha_real/compose.yml up --build
17
+ ```
18
+
19
+ ## Without Docker
20
+
21
+ Terminal window 1:
22
+
23
+ ```bash
24
+ # Create virtual environment
25
+ uv venv --python 3.10 examples/aloha_real/.venv
26
+ source examples/aloha_real/.venv/bin/activate
27
+ uv pip sync examples/aloha_real/requirements.txt
28
+ uv pip install -e packages/openpi-client
29
+
30
+ # Run the robot
31
+ python -m examples.aloha_real.main
32
+ ```
33
+
34
+ Terminal window 2:
35
+
36
+ ```bash
37
+ roslaunch aloha ros_nodes.launch
38
+ ```
39
+
40
+ Terminal window 3:
41
+
42
+ ```bash
43
+ uv run scripts/serve_policy.py --env ALOHA --default_prompt='take the toast out of the toaster'
44
+ ```
45
+
46
+ ## **ALOHA Checkpoint Guide**
47
+
48
+
49
+ The `pi0_base` model can be used in zero shot for a simple task on the ALOHA platform, and we additionally provide two example fine-tuned checkpoints, “fold the towel” and “open the tupperware and put the food on the plate,” which can perform more advanced tasks on the ALOHA.
50
+
51
+ While we’ve found the policies to work in unseen conditions across multiple ALOHA stations, we provide some pointers here on how best to set up scenes to maximize the chance of policy success. We cover the prompts to use for the policies, objects we’ve seen it work well on, and well-represented initial state distributions. Running these policies in zero shot is still a very experimental feature, and there is no guarantee that they will work on your robot. The recommended way to use `pi0_base` is by finetuning with data from the target robot.
52
+
53
+
54
+ ---
55
+
56
+ ### **Toast Task**
57
+
58
+ This task involves the robot taking two pieces of toast out of a toaster and placing them on a plate.
59
+
60
+ - **Checkpoint path**: `gs://openpi-assets/checkpoints/pi0_base`
61
+ - **Prompt**: "take the toast out of the toaster"
62
+ - **Objects needed**: Two pieces of toast, a plate, and a standard toaster.
63
+ - **Object Distribution**:
64
+ - Works on both real toast and rubber fake toast
65
+ - Compatible with standard 2-slice toasters
66
+ - Works with plates of varying colors
67
+
68
+ ### **Scene Setup Guidelines**
69
+ <img width="500" alt="Screenshot 2025-01-31 at 10 06 02 PM" src="https://github.com/user-attachments/assets/3d043d95-9d1c-4dda-9991-e63cae61e02e" />
70
+
71
+ - The toaster should be positioned in the top-left quadrant of the workspace.
72
+ - Both pieces of toast should start inside the toaster, with at least 1 cm of bread sticking out from the top.
73
+ - The plate should be placed roughly in the lower-center of the workspace.
74
+ - Works with both natural and synthetic lighting, but avoid making the scene too dark (e.g., don't place the setup inside an enclosed space or under a curtain).
75
+
76
+
77
+ ### **Towel Task**
78
+
79
+ This task involves folding a small towel (e.g., roughly the size of a hand towel) into eighths.
80
+
81
+ - **Checkpoint path**: `gs://openpi-assets/checkpoints/pi0_aloha_towel`
82
+ - **Prompt**: "fold the towel"
83
+ - **Object Distribution**:
84
+ - Works on towels of varying solid colors
85
+ - Performance is worse on heavily textured or striped towels
86
+
87
+ ### **Scene Setup Guidelines**
88
+ <img width="500" alt="Screenshot 2025-01-31 at 10 01 15 PM" src="https://github.com/user-attachments/assets/9410090c-467d-4a9c-ac76-96e5b4d00943" />
89
+
90
+ - The towel should be flattened and roughly centered on the table.
91
+ - Choose a towel that does not blend in with the table surface.
92
+
93
+
94
+ ### **Tupperware Task**
95
+
96
+ This task involves opening a tupperware filled with food and pouring the contents onto a plate.
97
+
98
+ - **Checkpoint path**: `gs://openpi-assets/checkpoints/pi0_aloha_tupperware`
99
+ - **Prompt**: "open the tupperware and put the food on the plate"
100
+ - **Objects needed**: Tupperware, food (or food-like items), and a plate.
101
+ - **Object Distribution**:
102
+ - Works on various types of fake food (e.g., fake chicken nuggets, fries, and fried chicken).
103
+ - Compatible with tupperware of different lid colors and shapes, with best performance on square tupperware with a corner flap (see images below).
104
+ - The policy has seen plates of varying solid colors.
105
+
106
+ ### **Scene Setup Guidelines**
107
+ <img width="500" alt="Screenshot 2025-01-31 at 10 02 27 PM" src="https://github.com/user-attachments/assets/60fc1de0-2d64-4076-b903-f427e5e9d1bf" />
108
+
109
+ - Best performance observed when both the tupperware and plate are roughly centered in the workspace.
110
+ - Positioning:
111
+ - Tupperware should be on the left.
112
+ - Plate should be on the right or bottom.
113
+ - The tupperware flap should point toward the plate.
114
+
115
+ ## Training on your own Aloha dataset
116
+
117
+ 1. Convert the dataset to the LeRobot dataset v2.0 format.
118
+
119
+ We provide a script [convert_aloha_data_to_lerobot.py](./convert_aloha_data_to_lerobot.py) that converts the dataset to the LeRobot dataset v2.0 format. As an example we have converted the `aloha_pen_uncap_diverse_raw` dataset from the [BiPlay repo](https://huggingface.co/datasets/oier-mees/BiPlay/tree/main/aloha_pen_uncap_diverse_raw) and uploaded it to the HuggingFace Hub as [physical-intelligence/aloha_pen_uncap_diverse](https://huggingface.co/datasets/physical-intelligence/aloha_pen_uncap_diverse).
120
+
121
+
122
+ 2. Define a training config that uses the custom dataset.
123
+
124
+ We provide the [pi0_aloha_pen_uncap config](../../src/openpi/training/config.py) as an example. You should refer to the root [README](../../README.md) for how to run training with the new config.
125
+
126
+ IMPORTANT: Our base checkpoint includes normalization stats from various common robot configurations. When fine-tuning a base checkpoint with a custom dataset from one of these configurations, we recommend using the corresponding normalization stats provided in the base checkpoint. In the example, this is done by specifying the trossen asset_id and a path to the pretrained checkpoint’s asset directory within the AssetsConfig.
capvector-pi05/examples/aloha_real/compose.yml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Run with:
2
+ # docker compose -f examples/aloha_real/compose.yml up --build
3
+ services:
4
+ runtime:
5
+ image: aloha_real
6
+ depends_on:
7
+ - aloha_ros_nodes
8
+ - ros_master
9
+ - openpi_server
10
+ build:
11
+ context: ../..
12
+ dockerfile: examples/aloha_real/Dockerfile
13
+ init: true
14
+ tty: true
15
+ network_mode: host
16
+ privileged: true
17
+ volumes:
18
+ - $PWD:/app
19
+ - ../../data:/data
20
+
21
+ aloha_ros_nodes:
22
+ image: aloha_real
23
+ depends_on:
24
+ - ros_master
25
+ build:
26
+ context: ../..
27
+ dockerfile: examples/aloha_real/Dockerfile
28
+ init: true
29
+ tty: true
30
+ network_mode: host
31
+ privileged: true
32
+ volumes:
33
+ - /dev:/dev
34
+ command: roslaunch --wait aloha ros_nodes.launch
35
+
36
+ ros_master:
37
+ image: ros:noetic-robot
38
+ network_mode: host
39
+ privileged: true
40
+ command:
41
+ - roscore
42
+
43
+ openpi_server:
44
+ image: openpi_server
45
+ build:
46
+ context: ../..
47
+ dockerfile: scripts/docker/serve_policy.Dockerfile
48
+ init: true
49
+ tty: true
50
+ network_mode: host
51
+ volumes:
52
+ - $PWD:/app
53
+ - ${OPENPI_DATA_HOME:-~/.cache/openpi}:/openpi_assets
54
+ environment:
55
+ - SERVER_ARGS
56
+ - OPENPI_DATA_HOME=/openpi_assets
57
+ - IS_DOCKER=true
58
+
59
+ # Comment out this block if not running on a machine with GPUs.
60
+ deploy:
61
+ resources:
62
+ reservations:
63
+ devices:
64
+ - driver: nvidia
65
+ count: 1
66
+ capabilities: [gpu]
capvector-pi05/examples/aloha_real/constants.py ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore lint errors because this file is mostly copied from ACT (https://github.com/tonyzhaozh/act).
2
+ # ruff: noqa
3
+
4
+ ### Task parameters
5
+
6
+ ### ALOHA fixed constants
7
+ DT = 0.001
8
+ JOINT_NAMES = ["waist", "shoulder", "elbow", "forearm_roll", "wrist_angle", "wrist_rotate"]
9
+ START_ARM_POSE = [0, -0.96, 1.16, 0, -0.3, 0, 0.02239, -0.02239, 0, -0.96, 1.16, 0, -0.3, 0, 0.02239, -0.02239]
10
+
11
+ # Left finger position limits (qpos[7]), right_finger = -1 * left_finger
12
+ MASTER_GRIPPER_POSITION_OPEN = 0.02417
13
+ MASTER_GRIPPER_POSITION_CLOSE = 0.01244
14
+ PUPPET_GRIPPER_POSITION_OPEN = 0.05800
15
+ PUPPET_GRIPPER_POSITION_CLOSE = 0.01844
16
+
17
+ # Gripper joint limits (qpos[6])
18
+ MASTER_GRIPPER_JOINT_OPEN = 0.3083
19
+ MASTER_GRIPPER_JOINT_CLOSE = -0.6842
20
+ PUPPET_GRIPPER_JOINT_OPEN = 1.4910
21
+ PUPPET_GRIPPER_JOINT_CLOSE = -0.6213
22
+
23
+ ############################ Helper functions ############################
24
+
25
+ MASTER_GRIPPER_POSITION_NORMALIZE_FN = lambda x: (x - MASTER_GRIPPER_POSITION_CLOSE) / (
26
+ MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE
27
+ )
28
+ PUPPET_GRIPPER_POSITION_NORMALIZE_FN = lambda x: (x - PUPPET_GRIPPER_POSITION_CLOSE) / (
29
+ PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE
30
+ )
31
+ MASTER_GRIPPER_POSITION_UNNORMALIZE_FN = (
32
+ lambda x: x * (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE) + MASTER_GRIPPER_POSITION_CLOSE
33
+ )
34
+ PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN = (
35
+ lambda x: x * (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE) + PUPPET_GRIPPER_POSITION_CLOSE
36
+ )
37
+ MASTER2PUPPET_POSITION_FN = lambda x: PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN(MASTER_GRIPPER_POSITION_NORMALIZE_FN(x))
38
+
39
+ MASTER_GRIPPER_JOINT_NORMALIZE_FN = lambda x: (x - MASTER_GRIPPER_JOINT_CLOSE) / (
40
+ MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE
41
+ )
42
+ PUPPET_GRIPPER_JOINT_NORMALIZE_FN = lambda x: (x - PUPPET_GRIPPER_JOINT_CLOSE) / (
43
+ PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE
44
+ )
45
+ MASTER_GRIPPER_JOINT_UNNORMALIZE_FN = (
46
+ lambda x: x * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE) + MASTER_GRIPPER_JOINT_CLOSE
47
+ )
48
+ PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN = (
49
+ lambda x: x * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE) + PUPPET_GRIPPER_JOINT_CLOSE
50
+ )
51
+ MASTER2PUPPET_JOINT_FN = lambda x: PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(MASTER_GRIPPER_JOINT_NORMALIZE_FN(x))
52
+
53
+ MASTER_GRIPPER_VELOCITY_NORMALIZE_FN = lambda x: x / (MASTER_GRIPPER_POSITION_OPEN - MASTER_GRIPPER_POSITION_CLOSE)
54
+ PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN = lambda x: x / (PUPPET_GRIPPER_POSITION_OPEN - PUPPET_GRIPPER_POSITION_CLOSE)
55
+
56
+ MASTER_POS2JOINT = (
57
+ lambda x: MASTER_GRIPPER_POSITION_NORMALIZE_FN(x) * (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE)
58
+ + MASTER_GRIPPER_JOINT_CLOSE
59
+ )
60
+ MASTER_JOINT2POS = lambda x: MASTER_GRIPPER_POSITION_UNNORMALIZE_FN(
61
+ (x - MASTER_GRIPPER_JOINT_CLOSE) / (MASTER_GRIPPER_JOINT_OPEN - MASTER_GRIPPER_JOINT_CLOSE)
62
+ )
63
+ PUPPET_POS2JOINT = (
64
+ lambda x: PUPPET_GRIPPER_POSITION_NORMALIZE_FN(x) * (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE)
65
+ + PUPPET_GRIPPER_JOINT_CLOSE
66
+ )
67
+ PUPPET_JOINT2POS = lambda x: PUPPET_GRIPPER_POSITION_UNNORMALIZE_FN(
68
+ (x - PUPPET_GRIPPER_JOINT_CLOSE) / (PUPPET_GRIPPER_JOINT_OPEN - PUPPET_GRIPPER_JOINT_CLOSE)
69
+ )
70
+
71
+ MASTER_GRIPPER_JOINT_MID = (MASTER_GRIPPER_JOINT_OPEN + MASTER_GRIPPER_JOINT_CLOSE) / 2
capvector-pi05/examples/aloha_real/convert_aloha_data_to_lerobot.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Script to convert Aloha hdf5 data to the LeRobot dataset v2.0 format.
3
+
4
+ Example usage: uv run examples/aloha_real/convert_aloha_data_to_lerobot.py --raw-dir /path/to/raw/data --repo-id <org>/<dataset-name>
5
+ """
6
+
7
+ import dataclasses
8
+ from pathlib import Path
9
+ import shutil
10
+ from typing import Literal
11
+
12
+ import h5py
13
+ from lerobot.common.constants import HF_LEROBOT_HOME
14
+ from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
15
+ import numpy as np
16
+ import torch
17
+ import tqdm
18
+ import tyro
19
+
20
+
21
+ @dataclasses.dataclass(frozen=True)
22
+ class DatasetConfig:
23
+ use_videos: bool = True
24
+ tolerance_s: float = 0.0001
25
+ image_writer_processes: int = 10
26
+ image_writer_threads: int = 5
27
+ video_backend: str | None = None
28
+
29
+
30
+ DEFAULT_DATASET_CONFIG = DatasetConfig()
31
+
32
+
33
+ def create_empty_dataset(
34
+ repo_id: str,
35
+ robot_type: str,
36
+ cameras: list[str],
37
+ mode: Literal["video", "image"] = "video",
38
+ *,
39
+ has_velocity: bool = False,
40
+ has_effort: bool = False,
41
+ dataset_config: DatasetConfig = DEFAULT_DATASET_CONFIG,
42
+ ) -> LeRobotDataset:
43
+ motors = [
44
+ "right_waist",
45
+ "right_shoulder",
46
+ "right_elbow",
47
+ "right_forearm_roll",
48
+ "right_wrist_angle",
49
+ "right_wrist_rotate",
50
+ "right_gripper",
51
+ "left_waist",
52
+ "left_shoulder",
53
+ "left_elbow",
54
+ "left_forearm_roll",
55
+ "left_wrist_angle",
56
+ "left_wrist_rotate",
57
+ "left_gripper",
58
+ ]
59
+
60
+ features = {
61
+ "observation.state": {
62
+ "dtype": "float32",
63
+ "shape": (len(motors),),
64
+ "names": [
65
+ motors,
66
+ ],
67
+ },
68
+ "action": {
69
+ "dtype": "float32",
70
+ "shape": (len(motors),),
71
+ "names": [
72
+ motors,
73
+ ],
74
+ },
75
+ }
76
+
77
+ if has_velocity:
78
+ features["observation.velocity"] = {
79
+ "dtype": "float32",
80
+ "shape": (len(motors),),
81
+ "names": [
82
+ motors,
83
+ ],
84
+ }
85
+
86
+ if has_effort:
87
+ features["observation.effort"] = {
88
+ "dtype": "float32",
89
+ "shape": (len(motors),),
90
+ "names": [
91
+ motors,
92
+ ],
93
+ }
94
+
95
+ for cam in cameras:
96
+ features[f"observation.images.{cam}"] = {
97
+ "dtype": mode,
98
+ "shape": (3, 480, 640),
99
+ "names": [
100
+ "channels",
101
+ "height",
102
+ "width",
103
+ ],
104
+ }
105
+
106
+ if Path(HF_LEROBOT_HOME / repo_id).exists():
107
+ shutil.rmtree(HF_LEROBOT_HOME / repo_id)
108
+
109
+ return LeRobotDataset.create(
110
+ repo_id=repo_id,
111
+ fps=50,
112
+ robot_type=robot_type,
113
+ features=features,
114
+ use_videos=dataset_config.use_videos,
115
+ tolerance_s=dataset_config.tolerance_s,
116
+ image_writer_processes=dataset_config.image_writer_processes,
117
+ image_writer_threads=dataset_config.image_writer_threads,
118
+ video_backend=dataset_config.video_backend,
119
+ )
120
+
121
+
122
+ def get_cameras(hdf5_files: list[Path]) -> list[str]:
123
+ with h5py.File(hdf5_files[0], "r") as ep:
124
+ # ignore depth channel, not currently handled
125
+ return [key for key in ep["/observations/images"].keys() if "depth" not in key] # noqa: SIM118
126
+
127
+
128
+ def has_velocity(hdf5_files: list[Path]) -> bool:
129
+ with h5py.File(hdf5_files[0], "r") as ep:
130
+ return "/observations/qvel" in ep
131
+
132
+
133
+ def has_effort(hdf5_files: list[Path]) -> bool:
134
+ with h5py.File(hdf5_files[0], "r") as ep:
135
+ return "/observations/effort" in ep
136
+
137
+
138
+ def load_raw_images_per_camera(ep: h5py.File, cameras: list[str]) -> dict[str, np.ndarray]:
139
+ imgs_per_cam = {}
140
+ for camera in cameras:
141
+ uncompressed = ep[f"/observations/images/{camera}"].ndim == 4
142
+
143
+ if uncompressed:
144
+ # load all images in RAM
145
+ imgs_array = ep[f"/observations/images/{camera}"][:]
146
+ else:
147
+ import cv2
148
+
149
+ # load one compressed image after the other in RAM and uncompress
150
+ imgs_array = []
151
+ for data in ep[f"/observations/images/{camera}"]:
152
+ imgs_array.append(cv2.cvtColor(cv2.imdecode(data, 1), cv2.COLOR_BGR2RGB))
153
+ imgs_array = np.array(imgs_array)
154
+
155
+ imgs_per_cam[camera] = imgs_array
156
+ return imgs_per_cam
157
+
158
+
159
+ def load_raw_episode_data(
160
+ ep_path: Path,
161
+ cameras: list[str],
162
+ ) -> tuple[dict[str, np.ndarray], torch.Tensor, torch.Tensor, torch.Tensor | None, torch.Tensor | None]:
163
+ with h5py.File(ep_path, "r") as ep:
164
+ state = torch.from_numpy(ep["/observations/qpos"][:])
165
+ action = torch.from_numpy(ep["/action"][:])
166
+
167
+ velocity = None
168
+ if "/observations/qvel" in ep:
169
+ velocity = torch.from_numpy(ep["/observations/qvel"][:])
170
+
171
+ effort = None
172
+ if "/observations/effort" in ep:
173
+ effort = torch.from_numpy(ep["/observations/effort"][:])
174
+
175
+ imgs_per_cam = load_raw_images_per_camera(ep, cameras)
176
+
177
+ return imgs_per_cam, state, action, velocity, effort
178
+
179
+
180
+ def populate_dataset(
181
+ dataset: LeRobotDataset,
182
+ hdf5_files: list[Path],
183
+ cameras: list[str],
184
+ task: str,
185
+ episodes: list[int] | None = None,
186
+ ) -> LeRobotDataset:
187
+ if episodes is None:
188
+ episodes = range(len(hdf5_files))
189
+
190
+ for ep_idx in tqdm.tqdm(episodes):
191
+ ep_path = hdf5_files[ep_idx]
192
+
193
+ imgs_per_cam, state, action, velocity, effort = load_raw_episode_data(ep_path, cameras)
194
+ num_frames = state.shape[0]
195
+
196
+ for i in range(num_frames):
197
+ frame = {
198
+ "observation.state": state[i],
199
+ "action": action[i],
200
+ "task": task,
201
+ }
202
+
203
+ for camera, img_array in imgs_per_cam.items():
204
+ frame[f"observation.images.{camera}"] = img_array[i]
205
+
206
+ if velocity is not None:
207
+ frame["observation.velocity"] = velocity[i]
208
+ if effort is not None:
209
+ frame["observation.effort"] = effort[i]
210
+
211
+ dataset.add_frame(frame)
212
+
213
+ dataset.save_episode()
214
+
215
+ return dataset
216
+
217
+
218
+ def port_aloha(
219
+ raw_dir: Path,
220
+ repo_id: str,
221
+ task: str = "DEBUG",
222
+ *,
223
+ episodes: list[int] | None = None,
224
+ push_to_hub: bool = False,
225
+ is_mobile: bool = False,
226
+ mode: Literal["video", "image"] = "image",
227
+ dataset_config: DatasetConfig = DEFAULT_DATASET_CONFIG,
228
+ ):
229
+ if (HF_LEROBOT_HOME / repo_id).exists():
230
+ shutil.rmtree(HF_LEROBOT_HOME / repo_id)
231
+
232
+ if not raw_dir.exists():
233
+ raise ValueError(f"Raw directory {raw_dir} does not exist. Please provide a valid path to the raw data.")
234
+
235
+ hdf5_files = sorted(raw_dir.glob("episode_*.hdf5"))
236
+
237
+ # Get camera names from the first episode
238
+ cameras = get_cameras(hdf5_files)
239
+ print(f"Detected cameras: {cameras}")
240
+
241
+ dataset = create_empty_dataset(
242
+ repo_id,
243
+ robot_type="mobile_aloha" if is_mobile else "aloha",
244
+ cameras=cameras,
245
+ mode=mode,
246
+ has_effort=has_effort(hdf5_files),
247
+ has_velocity=has_velocity(hdf5_files),
248
+ dataset_config=dataset_config,
249
+ )
250
+ dataset = populate_dataset(
251
+ dataset,
252
+ hdf5_files,
253
+ cameras=cameras,
254
+ task=task,
255
+ episodes=episodes,
256
+ )
257
+
258
+ if push_to_hub:
259
+ dataset.push_to_hub()
260
+
261
+
262
+ if __name__ == "__main__":
263
+ tyro.cli(port_aloha)
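For reference, a minimal sketch of driving the conversion from Python rather than through the tyro CLI; the module path, raw-data path, repo id, and task string below are placeholders, not values from this repository:

```python
from pathlib import Path

# Assumed import path for this script; adjust to wherever it lives in your checkout.
from examples.aloha_real.convert_aloha_data_to_lerobot import port_aloha

port_aloha(
    raw_dir=Path("/data/aloha_raw/my_task"),  # directory containing episode_*.hdf5 files
    repo_id="my-org/aloha_my_task",           # LeRobot repo id; an existing local copy is deleted first
    task="pick up the cube",                  # language instruction stored with every frame
    episodes=[0, 1, 2],                       # convert only these episodes; omit to convert all
    push_to_hub=False,
)
```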
capvector-pi05/examples/aloha_real/env.py ADDED
@@ -0,0 +1,57 @@
1
+ from typing import List, Optional # noqa: UP035
2
+
3
+ import einops
4
+ from openpi_client import image_tools
5
+ from openpi_client.runtime import environment as _environment
6
+ from typing_extensions import override
7
+
8
+ from examples.aloha_real import real_env as _real_env
9
+
10
+
11
+ class AlohaRealEnvironment(_environment.Environment):
12
+ """An environment for an Aloha robot on real hardware."""
13
+
14
+ def __init__(
15
+ self,
16
+ reset_position: Optional[List[float]] = None, # noqa: UP006,UP007
17
+ render_height: int = 224,
18
+ render_width: int = 224,
19
+ ) -> None:
20
+ self._env = _real_env.make_real_env(init_node=True, reset_position=reset_position)
21
+ self._render_height = render_height
22
+ self._render_width = render_width
23
+
24
+ self._ts = None
25
+
26
+ @override
27
+ def reset(self) -> None:
28
+ self._ts = self._env.reset()
29
+
30
+ @override
31
+ def is_episode_complete(self) -> bool:
32
+ return False
33
+
34
+ @override
35
+ def get_observation(self) -> dict:
36
+ if self._ts is None:
37
+ raise RuntimeError("Timestep is not set. Call reset() first.")
38
+
39
+ obs = self._ts.observation
40
+ for k in list(obs["images"].keys()):
41
+ if "_depth" in k:
42
+ del obs["images"][k]
43
+
44
+ for cam_name in obs["images"]:
45
+ img = image_tools.convert_to_uint8(
46
+ image_tools.resize_with_pad(obs["images"][cam_name], self._render_height, self._render_width)
47
+ )
48
+ obs["images"][cam_name] = einops.rearrange(img, "h w c -> c h w")
49
+
50
+ return {
51
+ "state": obs["qpos"],
52
+ "images": obs["images"],
53
+ }
54
+
55
+ @override
56
+ def apply_action(self, action: dict) -> None:
57
+ self._ts = self._env.step(action["actions"])
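As a quick reference, a small sanity-check sketch of the observation dict produced by `get_observation` above; the 224×224 shape assumes the default render size:

```python
import numpy as np

def check_observation(obs: dict) -> None:
    """Sketch: validate the dict returned by AlohaRealEnvironment.get_observation()."""
    assert obs["state"].ndim == 1                  # flat qpos vector
    for name, img in obs["images"].items():
        assert img.dtype == np.uint8, name
        assert img.shape == (3, 224, 224), name    # channel-first after resize_with_pad
```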
capvector-pi05/examples/aloha_real/main.py ADDED
@@ -0,0 +1,51 @@
1
+ import dataclasses
2
+ import logging
3
+
4
+ from openpi_client import action_chunk_broker
5
+ from openpi_client import websocket_client_policy as _websocket_client_policy
6
+ from openpi_client.runtime import runtime as _runtime
7
+ from openpi_client.runtime.agents import policy_agent as _policy_agent
8
+ import tyro
9
+
10
+ from examples.aloha_real import env as _env
11
+
12
+
13
+ @dataclasses.dataclass
14
+ class Args:
15
+ host: str = "0.0.0.0"
16
+ port: int = 8000
17
+
18
+ action_horizon: int = 25
19
+
20
+ num_episodes: int = 1
21
+ max_episode_steps: int = 1000
22
+
23
+
24
+ def main(args: Args) -> None:
25
+ ws_client_policy = _websocket_client_policy.WebsocketClientPolicy(
26
+ host=args.host,
27
+ port=args.port,
28
+ )
29
+ logging.info(f"Server metadata: {ws_client_policy.get_server_metadata()}")
30
+
31
+ metadata = ws_client_policy.get_server_metadata()
32
+ runtime = _runtime.Runtime(
33
+ environment=_env.AlohaRealEnvironment(reset_position=metadata.get("reset_pose")),
34
+ agent=_policy_agent.PolicyAgent(
35
+ policy=action_chunk_broker.ActionChunkBroker(
36
+ policy=ws_client_policy,
37
+ action_horizon=args.action_horizon,
38
+ )
39
+ ),
40
+ subscribers=[],
41
+ max_hz=50,
42
+ num_episodes=args.num_episodes,
43
+ max_episode_steps=args.max_episode_steps,
44
+ )
45
+
46
+ runtime.run()
47
+
48
+
49
+ if __name__ == "__main__":
50
+ logging.basicConfig(level=logging.INFO, force=True)
51
+ tyro.cli(main)
capvector-pi05/examples/aloha_real/real_env.py ADDED
@@ -0,0 +1,176 @@
1
+ # Ignore lint errors because this file is mostly copied from ACT (https://github.com/tonyzhaozh/act).
2
+ # ruff: noqa
3
+ import collections
4
+ import time
5
+ from typing import Optional, List
6
+ import dm_env
7
+ from interbotix_xs_modules.arm import InterbotixManipulatorXS
8
+ from interbotix_xs_msgs.msg import JointSingleCommand
9
+ import numpy as np
10
+
11
+ from examples.aloha_real import constants
12
+ from examples.aloha_real import robot_utils
13
+
14
+ # This is the reset position that is used by the standard Aloha runtime.
15
+ DEFAULT_RESET_POSITION = [0, -0.96, 1.16, 0, -0.3, 0]
16
+
17
+
18
+ class RealEnv:
19
+ """
20
+ Environment for real robot bi-manual manipulation
21
+ Action space: [left_arm_qpos (6), # absolute joint position
22
+ left_gripper_positions (1), # normalized gripper position (0: close, 1: open)
23
+ right_arm_qpos (6), # absolute joint position
24
+ right_gripper_positions (1),] # normalized gripper position (0: close, 1: open)
25
+
26
+ Observation space: {"qpos": Concat[ left_arm_qpos (6), # absolute joint position
27
+ left_gripper_position (1), # normalized gripper position (0: close, 1: open)
28
+ right_arm_qpos (6), # absolute joint position
29
+ right_gripper_qpos (1)] # normalized gripper position (0: close, 1: open)
30
+ "qvel": Concat[ left_arm_qvel (6), # absolute joint velocity (rad)
31
+ left_gripper_velocity (1), # normalized gripper velocity (pos: opening, neg: closing)
32
+ right_arm_qvel (6), # absolute joint velocity (rad)
33
+ right_gripper_qvel (1)] # normalized gripper velocity (pos: opening, neg: closing)
34
+ "images": {"cam_high": (480x640x3), # h, w, c, dtype='uint8'
35
+ "cam_low": (480x640x3), # h, w, c, dtype='uint8'
36
+ "cam_left_wrist": (480x640x3), # h, w, c, dtype='uint8'
37
+ "cam_right_wrist": (480x640x3)} # h, w, c, dtype='uint8'
38
+ """
39
+
40
+ def __init__(self, init_node, *, reset_position: Optional[List[float]] = None, setup_robots: bool = True):
41
+ # reset_position = START_ARM_POSE[:6]
42
+ self._reset_position = reset_position[:6] if reset_position else DEFAULT_RESET_POSITION
43
+
44
+ self.puppet_bot_left = InterbotixManipulatorXS(
45
+ robot_model="vx300s",
46
+ group_name="arm",
47
+ gripper_name="gripper",
48
+ robot_name="puppet_left",
49
+ init_node=init_node,
50
+ )
51
+ self.puppet_bot_right = InterbotixManipulatorXS(
52
+ robot_model="vx300s", group_name="arm", gripper_name="gripper", robot_name="puppet_right", init_node=False
53
+ )
54
+ if setup_robots:
55
+ self.setup_robots()
56
+
57
+ self.recorder_left = robot_utils.Recorder("left", init_node=False)
58
+ self.recorder_right = robot_utils.Recorder("right", init_node=False)
59
+ self.image_recorder = robot_utils.ImageRecorder(init_node=False)
60
+ self.gripper_command = JointSingleCommand(name="gripper")
61
+
62
+ def setup_robots(self):
63
+ robot_utils.setup_puppet_bot(self.puppet_bot_left)
64
+ robot_utils.setup_puppet_bot(self.puppet_bot_right)
65
+
66
+ def get_qpos(self):
67
+ left_qpos_raw = self.recorder_left.qpos
68
+ right_qpos_raw = self.recorder_right.qpos
69
+ left_arm_qpos = left_qpos_raw[:6]
70
+ right_arm_qpos = right_qpos_raw[:6]
71
+ left_gripper_qpos = [
72
+ constants.PUPPET_GRIPPER_POSITION_NORMALIZE_FN(left_qpos_raw[7])
73
+ ] # this is position not joint
74
+ right_gripper_qpos = [
75
+ constants.PUPPET_GRIPPER_POSITION_NORMALIZE_FN(right_qpos_raw[7])
76
+ ] # this is position not joint
77
+ return np.concatenate([left_arm_qpos, left_gripper_qpos, right_arm_qpos, right_gripper_qpos])
78
+
79
+ def get_qvel(self):
80
+ left_qvel_raw = self.recorder_left.qvel
81
+ right_qvel_raw = self.recorder_right.qvel
82
+ left_arm_qvel = left_qvel_raw[:6]
83
+ right_arm_qvel = right_qvel_raw[:6]
84
+ left_gripper_qvel = [constants.PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN(left_qvel_raw[7])]
85
+ right_gripper_qvel = [constants.PUPPET_GRIPPER_VELOCITY_NORMALIZE_FN(right_qvel_raw[7])]
86
+ return np.concatenate([left_arm_qvel, left_gripper_qvel, right_arm_qvel, right_gripper_qvel])
87
+
88
+ def get_effort(self):
89
+ left_effort_raw = self.recorder_left.effort
90
+ right_effort_raw = self.recorder_right.effort
91
+ left_robot_effort = left_effort_raw[:7]
92
+ right_robot_effort = right_effort_raw[:7]
93
+ return np.concatenate([left_robot_effort, right_robot_effort])
94
+
95
+ def get_images(self):
96
+ return self.image_recorder.get_images()
97
+
98
+ def set_gripper_pose(self, left_gripper_desired_pos_normalized, right_gripper_desired_pos_normalized):
99
+ left_gripper_desired_joint = constants.PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(left_gripper_desired_pos_normalized)
100
+ self.gripper_command.cmd = left_gripper_desired_joint
101
+ self.puppet_bot_left.gripper.core.pub_single.publish(self.gripper_command)
102
+
103
+ right_gripper_desired_joint = constants.PUPPET_GRIPPER_JOINT_UNNORMALIZE_FN(
104
+ right_gripper_desired_pos_normalized
105
+ )
106
+ self.gripper_command.cmd = right_gripper_desired_joint
107
+ self.puppet_bot_right.gripper.core.pub_single.publish(self.gripper_command)
108
+
109
+ def _reset_joints(self):
110
+ robot_utils.move_arms(
111
+ [self.puppet_bot_left, self.puppet_bot_right], [self._reset_position, self._reset_position], move_time=1
112
+ )
113
+
114
+ def _reset_gripper(self):
115
+ """Set to position mode and do position resets: first close then open. Then change back to PWM mode
116
+
117
+ NOTE: This diverges from the original Aloha code which first opens then closes the gripper. Pi internal aloha data
118
+ was collected with the gripper starting in the open position. Leaving the grippers fully closed was also found to
119
+ increase the frequency of motor faults.
120
+ """
121
+ robot_utils.move_grippers(
122
+ [self.puppet_bot_left, self.puppet_bot_right], [constants.PUPPET_GRIPPER_JOINT_CLOSE] * 2, move_time=1
123
+ )
124
+ robot_utils.move_grippers(
125
+ [self.puppet_bot_left, self.puppet_bot_right], [constants.PUPPET_GRIPPER_JOINT_OPEN] * 2, move_time=0.5
126
+ )
127
+
128
+ def get_observation(self):
129
+ obs = collections.OrderedDict()
130
+ obs["qpos"] = self.get_qpos()
131
+ obs["qvel"] = self.get_qvel()
132
+ obs["effort"] = self.get_effort()
133
+ obs["images"] = self.get_images()
134
+ return obs
135
+
136
+ def get_reward(self):
137
+ return 0
138
+
139
+ def reset(self, *, fake=False):
140
+ if not fake:
141
+ # Reboot puppet robot gripper motors
142
+ self.puppet_bot_left.dxl.robot_reboot_motors("single", "gripper", True)
143
+ self.puppet_bot_right.dxl.robot_reboot_motors("single", "gripper", True)
144
+ self._reset_joints()
145
+ self._reset_gripper()
146
+ return dm_env.TimeStep(
147
+ step_type=dm_env.StepType.FIRST, reward=self.get_reward(), discount=None, observation=self.get_observation()
148
+ )
149
+
150
+ def step(self, action):
151
+ state_len = int(len(action) / 2)
152
+ left_action = action[:state_len]
153
+ right_action = action[state_len:]
154
+ self.puppet_bot_left.arm.set_joint_positions(left_action[:6], blocking=False)
155
+ self.puppet_bot_right.arm.set_joint_positions(right_action[:6], blocking=False)
156
+ self.set_gripper_pose(left_action[-1], right_action[-1])
157
+ time.sleep(constants.DT)
158
+ return dm_env.TimeStep(
159
+ step_type=dm_env.StepType.MID, reward=self.get_reward(), discount=None, observation=self.get_observation()
160
+ )
161
+
162
+
163
+ def get_action(master_bot_left, master_bot_right):
164
+ action = np.zeros(14) # 6 joint + 1 gripper, for two arms
165
+ # Arm actions
166
+ action[:6] = master_bot_left.dxl.joint_states.position[:6]
167
+ action[7 : 7 + 6] = master_bot_right.dxl.joint_states.position[:6]
168
+ # Gripper actions
169
+ action[6] = constants.MASTER_GRIPPER_JOINT_NORMALIZE_FN(master_bot_left.dxl.joint_states.position[6])
170
+ action[7 + 6] = constants.MASTER_GRIPPER_JOINT_NORMALIZE_FN(master_bot_right.dxl.joint_states.position[6])
171
+
172
+ return action
173
+
174
+
175
+ def make_real_env(init_node, *, reset_position: Optional[List[float]] = None, setup_robots: bool = True) -> RealEnv:
176
+ return RealEnv(init_node, reset_position=reset_position, setup_robots=setup_robots)
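A short sketch of assembling a 14-dimensional action in the layout documented in the `RealEnv` docstring above; the joint values are placeholders:

```python
import numpy as np

# Layout: [left_arm_qpos (6), left_gripper (1), right_arm_qpos (6), right_gripper (1)]
left_arm_qpos = np.zeros(6)    # absolute joint positions (placeholder values)
right_arm_qpos = np.zeros(6)
left_gripper = 1.0             # normalized gripper position: 0 = closed, 1 = open
right_gripper = 0.0

action = np.concatenate([left_arm_qpos, [left_gripper], right_arm_qpos, [right_gripper]])
assert action.shape == (14,)
# RealEnv.step(action) splits this vector in half, commands each arm, then sets both grippers.
```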
capvector-pi05/examples/aloha_real/requirements.in ADDED
@@ -0,0 +1,18 @@
1
+ Pillow
2
+ dm_control
3
+ einops
4
+ h5py
5
+ matplotlib
6
+ modern_robotics
7
+ msgpack
8
+ numpy>=1.22.4,<2.0.0
9
+ opencv-python
10
+ packaging
11
+ pexpect
12
+ pyquaternion
13
+ pyrealsense2
14
+ pyyaml
15
+ requests
16
+ rospkg
17
+ tyro
18
+ websockets
capvector-pi05/examples/aloha_real/requirements.txt ADDED
@@ -0,0 +1,156 @@
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv pip compile examples/aloha_real/requirements.in -o examples/aloha_real/requirements.txt --python-version 3.10
3
+ absl-py==2.1.0
4
+ # via
5
+ # dm-control
6
+ # dm-env
7
+ # labmaze
8
+ # mujoco
9
+ catkin-pkg==1.0.0
10
+ # via rospkg
11
+ certifi==2024.8.30
12
+ # via requests
13
+ charset-normalizer==3.4.0
14
+ # via requests
15
+ contourpy==1.1.1
16
+ # via matplotlib
17
+ cycler==0.12.1
18
+ # via matplotlib
19
+ distro==1.9.0
20
+ # via rospkg
21
+ dm-control==1.0.23
22
+ # via -r examples/aloha_real/requirements.in
23
+ dm-env==1.6
24
+ # via dm-control
25
+ dm-tree==0.1.8
26
+ # via
27
+ # dm-control
28
+ # dm-env
29
+ docstring-parser==0.16
30
+ # via tyro
31
+ docutils==0.20.1
32
+ # via catkin-pkg
33
+ einops==0.8.0
34
+ # via -r examples/aloha_real/requirements.in
35
+ etils==1.3.0
36
+ # via mujoco
37
+ fonttools==4.55.2
38
+ # via matplotlib
39
+ glfw==2.8.0
40
+ # via
41
+ # dm-control
42
+ # mujoco
43
+ h5py==3.11.0
44
+ # via -r examples/aloha_real/requirements.in
45
+ idna==3.10
46
+ # via requests
47
+ importlib-resources==6.4.5
48
+ # via etils
49
+ kiwisolver==1.4.7
50
+ # via matplotlib
51
+ labmaze==1.0.6
52
+ # via dm-control
53
+ lxml==5.3.0
54
+ # via dm-control
55
+ markdown-it-py==3.0.0
56
+ # via rich
57
+ matplotlib==3.7.5
58
+ # via -r examples/aloha_real/requirements.in
59
+ mdurl==0.1.2
60
+ # via markdown-it-py
61
+ modern-robotics==1.1.1
62
+ # via -r examples/aloha_real/requirements.in
63
+ msgpack==1.1.0
64
+ # via -r examples/aloha_real/requirements.in
65
+ mujoco==3.2.3
66
+ # via dm-control
67
+ numpy==1.24.4
68
+ # via
69
+ # -r examples/aloha_real/requirements.in
70
+ # contourpy
71
+ # dm-control
72
+ # dm-env
73
+ # h5py
74
+ # labmaze
75
+ # matplotlib
76
+ # modern-robotics
77
+ # mujoco
78
+ # opencv-python
79
+ # pyquaternion
80
+ # scipy
81
+ opencv-python==4.10.0.84
82
+ # via -r examples/aloha_real/requirements.in
83
+ packaging==24.2
84
+ # via
85
+ # -r examples/aloha_real/requirements.in
86
+ # matplotlib
87
+ pexpect==4.9.0
88
+ # via -r examples/aloha_real/requirements.in
89
+ pillow==10.4.0
90
+ # via
91
+ # -r examples/aloha_real/requirements.in
92
+ # matplotlib
93
+ protobuf==5.29.1
94
+ # via dm-control
95
+ ptyprocess==0.7.0
96
+ # via pexpect
97
+ pygments==2.18.0
98
+ # via rich
99
+ pyopengl==3.1.7
100
+ # via
101
+ # dm-control
102
+ # mujoco
103
+ pyparsing==3.1.4
104
+ # via
105
+ # catkin-pkg
106
+ # dm-control
107
+ # matplotlib
108
+ pyquaternion==0.9.9
109
+ # via -r examples/aloha_real/requirements.in
110
+ pyrealsense2==2.55.1.6486
111
+ # via -r examples/aloha_real/requirements.in
112
+ python-dateutil==2.9.0.post0
113
+ # via
114
+ # catkin-pkg
115
+ # matplotlib
116
+ pyyaml==6.0.2
117
+ # via
118
+ # -r examples/aloha_real/requirements.in
119
+ # rospkg
120
+ requests==2.32.3
121
+ # via
122
+ # -r examples/aloha_real/requirements.in
123
+ # dm-control
124
+ rich==13.9.4
125
+ # via tyro
126
+ rospkg==1.5.1
127
+ # via -r examples/aloha_real/requirements.in
128
+ scipy==1.10.1
129
+ # via dm-control
130
+ setuptools==75.3.0
131
+ # via
132
+ # catkin-pkg
133
+ # dm-control
134
+ # labmaze
135
+ shtab==1.7.1
136
+ # via tyro
137
+ six==1.17.0
138
+ # via python-dateutil
139
+ tqdm==4.67.1
140
+ # via dm-control
141
+ typeguard==4.4.0
142
+ # via tyro
143
+ typing-extensions==4.12.2
144
+ # via
145
+ # etils
146
+ # rich
147
+ # typeguard
148
+ # tyro
149
+ tyro==0.9.2
150
+ # via -r examples/aloha_real/requirements.in
151
+ urllib3==2.2.3
152
+ # via requests
153
+ websockets==14.1
154
+ # via -r examples/aloha_real/requirements.in
155
+ zipp==3.20.2
156
+ # via etils
capvector-pi05/examples/aloha_real/robot_utils.py ADDED
@@ -0,0 +1,275 @@
1
+ # Ignore lint errors because this file is mostly copied from ACT (https://github.com/tonyzhaozh/act).
2
+ # ruff: noqa
3
+ from collections import deque
4
+ import datetime
5
+ import json
6
+ import time
7
+
8
+ from aloha.msg import RGBGrayscaleImage
9
+ from cv_bridge import CvBridge
10
+ from interbotix_xs_msgs.msg import JointGroupCommand
11
+ from interbotix_xs_msgs.msg import JointSingleCommand
12
+ import numpy as np
13
+ import rospy
14
+ from sensor_msgs.msg import JointState
15
+
16
+ from examples.aloha_real import constants
17
+
18
+
19
+ class ImageRecorder:
20
+ def __init__(self, init_node=True, is_debug=False):
21
+ self.is_debug = is_debug
22
+ self.bridge = CvBridge()
23
+ self.camera_names = ["cam_high", "cam_low", "cam_left_wrist", "cam_right_wrist"]
24
+
25
+ if init_node:
26
+ rospy.init_node("image_recorder", anonymous=True)
27
+ for cam_name in self.camera_names:
28
+ setattr(self, f"{cam_name}_rgb_image", None)
29
+ setattr(self, f"{cam_name}_depth_image", None)
30
+ setattr(self, f"{cam_name}_timestamp", 0.0)
31
+ if cam_name == "cam_high":
32
+ callback_func = self.image_cb_cam_high
33
+ elif cam_name == "cam_low":
34
+ callback_func = self.image_cb_cam_low
35
+ elif cam_name == "cam_left_wrist":
36
+ callback_func = self.image_cb_cam_left_wrist
37
+ elif cam_name == "cam_right_wrist":
38
+ callback_func = self.image_cb_cam_right_wrist
39
+ else:
40
+ raise NotImplementedError
41
+ rospy.Subscriber(f"/{cam_name}", RGBGrayscaleImage, callback_func)
42
+ if self.is_debug:
43
+ setattr(self, f"{cam_name}_timestamps", deque(maxlen=50))
44
+
45
+ self.cam_last_timestamps = {cam_name: 0.0 for cam_name in self.camera_names}
46
+ time.sleep(0.5)
47
+
48
+ def image_cb(self, cam_name, data):
49
+ setattr(
50
+ self,
51
+ f"{cam_name}_rgb_image",
52
+ self.bridge.imgmsg_to_cv2(data.images[0], desired_encoding="bgr8"),
53
+ )
54
+ # setattr(
55
+ # self,
56
+ # f"{cam_name}_depth_image",
57
+ # self.bridge.imgmsg_to_cv2(data.images[1], desired_encoding="mono16"),
58
+ # )
59
+ setattr(
60
+ self,
61
+ f"{cam_name}_timestamp",
62
+ data.header.stamp.secs + data.header.stamp.nsecs * 1e-9,
63
+ )
64
+ # setattr(self, f'{cam_name}_secs', data.images[0].header.stamp.secs)
65
+ # setattr(self, f'{cam_name}_nsecs', data.images[0].header.stamp.nsecs)
66
+ # cv2.imwrite('/home/lucyshi/Desktop/sample.jpg', cv_image)
67
+ if self.is_debug:
68
+ getattr(self, f"{cam_name}_timestamps").append(
69
+ data.images[0].header.stamp.secs + data.images[0].header.stamp.nsecs * 1e-9
70
+ )
71
+
72
+ def image_cb_cam_high(self, data):
73
+ cam_name = "cam_high"
74
+ return self.image_cb(cam_name, data)
75
+
76
+ def image_cb_cam_low(self, data):
77
+ cam_name = "cam_low"
78
+ return self.image_cb(cam_name, data)
79
+
80
+ def image_cb_cam_left_wrist(self, data):
81
+ cam_name = "cam_left_wrist"
82
+ return self.image_cb(cam_name, data)
83
+
84
+ def image_cb_cam_right_wrist(self, data):
85
+ cam_name = "cam_right_wrist"
86
+ return self.image_cb(cam_name, data)
87
+
88
+ def get_images(self):
89
+ image_dict = {}
90
+ for cam_name in self.camera_names:
91
+ while getattr(self, f"{cam_name}_timestamp") <= self.cam_last_timestamps[cam_name]:
92
+ time.sleep(0.00001)
93
+ rgb_image = getattr(self, f"{cam_name}_rgb_image")
94
+ depth_image = getattr(self, f"{cam_name}_depth_image")
95
+ self.cam_last_timestamps[cam_name] = getattr(self, f"{cam_name}_timestamp")
96
+ image_dict[cam_name] = rgb_image
97
+ image_dict[f"{cam_name}_depth"] = depth_image
98
+ return image_dict
99
+
100
+ def print_diagnostics(self):
101
+ def dt_helper(l):
102
+ l = np.array(l)
103
+ diff = l[1:] - l[:-1]
104
+ return np.mean(diff)
105
+
106
+ for cam_name in self.camera_names:
107
+ image_freq = 1 / dt_helper(getattr(self, f"{cam_name}_timestamps"))
108
+ print(f"{cam_name} {image_freq=:.2f}")
109
+ print()
110
+
111
+
112
+ class Recorder:
113
+ def __init__(self, side, init_node=True, is_debug=False):
114
+ self.secs = None
115
+ self.nsecs = None
116
+ self.qpos = None
117
+ self.effort = None
118
+ self.arm_command = None
119
+ self.gripper_command = None
120
+ self.is_debug = is_debug
121
+
122
+ if init_node:
123
+ rospy.init_node("recorder", anonymous=True)
124
+ rospy.Subscriber(f"/puppet_{side}/joint_states", JointState, self.puppet_state_cb)
125
+ rospy.Subscriber(
126
+ f"/puppet_{side}/commands/joint_group",
127
+ JointGroupCommand,
128
+ self.puppet_arm_commands_cb,
129
+ )
130
+ rospy.Subscriber(
131
+ f"/puppet_{side}/commands/joint_single",
132
+ JointSingleCommand,
133
+ self.puppet_gripper_commands_cb,
134
+ )
135
+ if self.is_debug:
136
+ self.joint_timestamps = deque(maxlen=50)
137
+ self.arm_command_timestamps = deque(maxlen=50)
138
+ self.gripper_command_timestamps = deque(maxlen=50)
139
+ time.sleep(0.1)
140
+
141
+ def puppet_state_cb(self, data):
142
+ self.qpos = data.position
143
+ self.qvel = data.velocity
144
+ self.effort = data.effort
145
+ self.data = data
146
+ if self.is_debug:
147
+ self.joint_timestamps.append(time.time())
148
+
149
+ def puppet_arm_commands_cb(self, data):
150
+ self.arm_command = data.cmd
151
+ if self.is_debug:
152
+ self.arm_command_timestamps.append(time.time())
153
+
154
+ def puppet_gripper_commands_cb(self, data):
155
+ self.gripper_command = data.cmd
156
+ if self.is_debug:
157
+ self.gripper_command_timestamps.append(time.time())
158
+
159
+ def print_diagnostics(self):
160
+ def dt_helper(l):
161
+ l = np.array(l)
162
+ diff = l[1:] - l[:-1]
163
+ return np.mean(diff)
164
+
165
+ joint_freq = 1 / dt_helper(self.joint_timestamps)
166
+ arm_command_freq = 1 / dt_helper(self.arm_command_timestamps)
167
+ gripper_command_freq = 1 / dt_helper(self.gripper_command_timestamps)
168
+
169
+ print(f"{joint_freq=:.2f}\n{arm_command_freq=:.2f}\n{gripper_command_freq=:.2f}\n")
170
+
171
+
172
+ def get_arm_joint_positions(bot):
173
+ return bot.arm.core.joint_states.position[:6]
174
+
175
+
176
+ def get_arm_gripper_positions(bot):
177
+ return bot.gripper.core.joint_states.position[6]
178
+
179
+
180
+ def move_arms(bot_list, target_pose_list, move_time=1):
181
+ num_steps = int(move_time / constants.DT)
182
+ curr_pose_list = [get_arm_joint_positions(bot) for bot in bot_list]
183
+ traj_list = [
184
+ np.linspace(curr_pose, target_pose, num_steps)
185
+ for curr_pose, target_pose in zip(curr_pose_list, target_pose_list)
186
+ ]
187
+ for t in range(num_steps):
188
+ for bot_id, bot in enumerate(bot_list):
189
+ bot.arm.set_joint_positions(traj_list[bot_id][t], blocking=False)
190
+ time.sleep(constants.DT)
191
+
192
+
193
+ def move_grippers(bot_list, target_pose_list, move_time):
194
+ print(f"Moving grippers to {target_pose_list=}")
195
+ gripper_command = JointSingleCommand(name="gripper")
196
+ num_steps = int(move_time / constants.DT)
197
+ curr_pose_list = [get_arm_gripper_positions(bot) for bot in bot_list]
198
+ traj_list = [
199
+ np.linspace(curr_pose, target_pose, num_steps)
200
+ for curr_pose, target_pose in zip(curr_pose_list, target_pose_list)
201
+ ]
202
+
203
+ with open(f"/data/gripper_traj_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}.jsonl", "a") as f:
204
+ for t in range(num_steps):
205
+ d = {}
206
+ for bot_id, bot in enumerate(bot_list):
207
+ gripper_command.cmd = traj_list[bot_id][t]
208
+ bot.gripper.core.pub_single.publish(gripper_command)
209
+ d[bot_id] = {"obs": get_arm_gripper_positions(bot), "act": traj_list[bot_id][t]}
210
+ f.write(json.dumps(d) + "\n")
211
+ time.sleep(constants.DT)
212
+
213
+
214
+ def setup_puppet_bot(bot):
215
+ bot.dxl.robot_reboot_motors("single", "gripper", True)
216
+ bot.dxl.robot_set_operating_modes("group", "arm", "position")
217
+ bot.dxl.robot_set_operating_modes("single", "gripper", "current_based_position")
218
+ torque_on(bot)
219
+
220
+
221
+ def setup_master_bot(bot):
222
+ bot.dxl.robot_set_operating_modes("group", "arm", "pwm")
223
+ bot.dxl.robot_set_operating_modes("single", "gripper", "current_based_position")
224
+ torque_off(bot)
225
+
226
+
227
+ def set_standard_pid_gains(bot):
228
+ bot.dxl.robot_set_motor_registers("group", "arm", "Position_P_Gain", 800)
229
+ bot.dxl.robot_set_motor_registers("group", "arm", "Position_I_Gain", 0)
230
+
231
+
232
+ def set_low_pid_gains(bot):
233
+ bot.dxl.robot_set_motor_registers("group", "arm", "Position_P_Gain", 100)
234
+ bot.dxl.robot_set_motor_registers("group", "arm", "Position_I_Gain", 0)
235
+
236
+
237
+ def torque_off(bot):
238
+ bot.dxl.robot_torque_enable("group", "arm", False)
239
+ bot.dxl.robot_torque_enable("single", "gripper", False)
240
+
241
+
242
+ def torque_on(bot):
243
+ bot.dxl.robot_torque_enable("group", "arm", True)
244
+ bot.dxl.robot_torque_enable("single", "gripper", True)
245
+
246
+
247
+ # for DAgger
248
+ def sync_puppet_to_master(master_bot_left, master_bot_right, puppet_bot_left, puppet_bot_right):
249
+ print("\nSyncing!")
250
+
251
+ # activate master arms
252
+ torque_on(master_bot_left)
253
+ torque_on(master_bot_right)
254
+
255
+ # get puppet arm positions
256
+ puppet_left_qpos = get_arm_joint_positions(puppet_bot_left)
257
+ puppet_right_qpos = get_arm_joint_positions(puppet_bot_right)
258
+
259
+ # get puppet gripper positions
260
+ puppet_left_gripper = get_arm_gripper_positions(puppet_bot_left)
261
+ puppet_right_gripper = get_arm_gripper_positions(puppet_bot_right)
262
+
263
+ # move master arms to puppet positions
264
+ move_arms(
265
+ [master_bot_left, master_bot_right],
266
+ [puppet_left_qpos, puppet_right_qpos],
267
+ move_time=1,
268
+ )
269
+
270
+ # move master grippers to puppet positions
271
+ move_grippers(
272
+ [master_bot_left, master_bot_right],
273
+ [puppet_left_gripper, puppet_right_gripper],
274
+ move_time=1,
275
+ )
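`move_arms` and `move_grippers` above both reduce to a linear interpolation executed once per control tick; a standalone sketch of that pattern, with 0.02 s standing in for `constants.DT`:

```python
import time
import numpy as np

def interpolate_move(curr_pose, target_pose, move_time: float, dt: float = 0.02):
    """Sketch of the interpolation used by move_arms/move_grippers."""
    num_steps = int(move_time / dt)
    for waypoint in np.linspace(curr_pose, target_pose, num_steps):
        # In the real helpers this waypoint is published to the arm or gripper controller.
        yield waypoint
        time.sleep(dt)
```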
capvector-pi05/examples/aloha_real/video_display.py ADDED
@@ -0,0 +1,36 @@
1
+ import matplotlib.pyplot as plt
2
+ import numpy as np
3
+ from openpi_client.runtime import subscriber as _subscriber
4
+ from typing_extensions import override
5
+
6
+
7
+ class VideoDisplay(_subscriber.Subscriber):
8
+ """Displays video frames."""
9
+
10
+ def __init__(self) -> None:
11
+ self._ax: plt.Axes | None = None
12
+ self._plt_img: plt.Image | None = None
13
+
14
+ @override
15
+ def on_episode_start(self) -> None:
16
+ plt.ion()
17
+ self._ax = plt.subplot()
18
+ self._plt_img = None
19
+
20
+ @override
21
+ def on_step(self, observation: dict, action: dict) -> None:
22
+ assert self._ax is not None
23
+
24
+ im = observation["image"][0] # [C, H, W]
25
+ im = np.transpose(im, (1, 2, 0)) # [H, W, C]
26
+
27
+ if self._plt_img is None:
28
+ self._plt_img = self._ax.imshow(im)
29
+ else:
30
+ self._plt_img.set_data(im)
31
+ plt.pause(0.001)
32
+
33
+ @override
34
+ def on_episode_end(self) -> None:
35
+ plt.ioff()
36
+ plt.close()
capvector-pi05/examples/aloha_sim/Dockerfile ADDED
@@ -0,0 +1,41 @@
1
+ # Dockerfile for the Aloha simulation environment.
2
+
3
+ # Build the container:
4
+ # docker build . -t aloha_sim -f examples/aloha_sim/Dockerfile
5
+
6
+ # Run the container:
7
+ # docker run --rm -it --network=host -v .:/app aloha_sim /bin/bash
8
+
9
+ FROM python:3.11-slim@sha256:370c586a6ffc8c619e6d652f81c094b34b14b8f2fb9251f092de23f16e299b78
10
+ COPY --from=ghcr.io/astral-sh/uv:0.5.1 /uv /uvx /bin/
11
+
12
+ RUN apt-get update && \
13
+ apt-get install -y \
14
+ libosmesa6-dev \
15
+ libgl1-mesa-glx \
16
+ libglew-dev \
17
+ libglfw3-dev \
18
+ libgles2-mesa-dev
19
+ ENV MUJOCO_GL=egl
20
+
21
+ WORKDIR /app
22
+
23
+ # Copy from the cache instead of linking since it's a mounted volume
24
+ ENV UV_LINK_MODE=copy
25
+
26
+ # Write the virtual environment outside of the project directory so it doesn't
27
+ # leak out of the container when we mount the application code.
28
+ ENV UV_PROJECT_ENVIRONMENT=/.venv
29
+
30
+ # Copy the requirements files so we can install dependencies.
31
+ # The rest of the project is mounted as a volume, so we don't need to rebuild on changes.
32
+ # This strategy is best for development-style usage.
33
+ COPY ./examples/aloha_sim/requirements.txt /tmp/requirements.txt
34
+ COPY ./packages/openpi-client/pyproject.toml /tmp/openpi-client/pyproject.toml
35
+
36
+ # Install python dependencies.
37
+ RUN uv venv --python 3.11.9 $UV_PROJECT_ENVIRONMENT
38
+ RUN uv pip sync /tmp/requirements.txt /tmp/openpi-client/pyproject.toml
39
+ ENV PYTHONPATH=/app:/app/src:/app/packages/openpi-client/src
40
+
41
+ CMD ["/bin/bash", "-c", "source /.venv/bin/activate && python examples/aloha_sim/main.py"]
capvector-pi05/examples/aloha_sim/README.md ADDED
@@ -0,0 +1,36 @@
1
+ # Run Aloha Sim
2
+
3
+ ## With Docker
4
+
5
+ ```bash
6
+ export SERVER_ARGS="--env ALOHA_SIM"
7
+ docker compose -f examples/aloha_sim/compose.yml up --build
8
+ ```
9
+
10
+ ## Without Docker
11
+
12
+ Terminal window 1:
13
+
14
+ ```bash
15
+ # Create virtual environment
16
+ uv venv --python 3.10 examples/aloha_sim/.venv
17
+ source examples/aloha_sim/.venv/bin/activate
18
+ uv pip sync examples/aloha_sim/requirements.txt
19
+ uv pip install -e packages/openpi-client
20
+
21
+ # Run the simulation
22
+ MUJOCO_GL=egl python examples/aloha_sim/main.py
23
+ ```
24
+
25
+ Note: If you are seeing EGL errors, you may need to install the following dependencies:
26
+
27
+ ```bash
28
+ sudo apt-get install -y libegl1-mesa-dev libgles2-mesa-dev
29
+ ```
30
+
31
+ Terminal window 2:
32
+
33
+ ```bash
34
+ # Run the server
35
+ uv run scripts/serve_policy.py --env ALOHA_SIM
36
+ ```
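As an optional sanity check (a sketch using the client API from this repo), you can confirm the policy server from terminal window 2 is reachable before launching the sim client; host and port match the defaults in `main.py`:

```python
from openpi_client import websocket_client_policy

client = websocket_client_policy.WebsocketClientPolicy(host="0.0.0.0", port=8000)
print(client.get_server_metadata())  # prints the server's metadata once it is up
```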
capvector-pi05/examples/aloha_sim/compose.yml ADDED
@@ -0,0 +1,42 @@
1
+ # Run with:
2
+ # docker compose -f examples/aloha_sim/compose.yml up --build
3
+ services:
4
+ runtime:
5
+ image: aloha_sim
6
+ depends_on:
7
+ - openpi_server
8
+ build:
9
+ context: ../..
10
+ dockerfile: examples/aloha_sim/Dockerfile
11
+ init: true
12
+ tty: true
13
+ network_mode: host
14
+ privileged: true
15
+ volumes:
16
+ - $PWD:/app
17
+ - ../../data:/data
18
+
19
+ openpi_server:
20
+ image: openpi_server
21
+ build:
22
+ context: ../..
23
+ dockerfile: scripts/docker/serve_policy.Dockerfile
24
+ init: true
25
+ tty: true
26
+ network_mode: host
27
+ volumes:
28
+ - $PWD:/app
29
+ - ${OPENPI_DATA_HOME:-~/.cache/openpi}:/openpi_assets
30
+ environment:
31
+ - SERVER_ARGS
32
+ - OPENPI_DATA_HOME=/openpi_assets
33
+ - IS_DOCKER=true
34
+
35
+ # Comment out this block if not running on a machine with GPUs.
36
+ deploy:
37
+ resources:
38
+ reservations:
39
+ devices:
40
+ - driver: nvidia
41
+ count: 1
42
+ capabilities: [gpu]
capvector-pi05/examples/aloha_sim/env.py ADDED
@@ -0,0 +1,56 @@
1
+ import gym_aloha # noqa: F401
2
+ import gymnasium
3
+ import numpy as np
4
+ from openpi_client import image_tools
5
+ from openpi_client.runtime import environment as _environment
6
+ from typing_extensions import override
7
+
8
+
9
+ class AlohaSimEnvironment(_environment.Environment):
10
+ """An environment for an Aloha robot in simulation."""
11
+
12
+ def __init__(self, task: str, obs_type: str = "pixels_agent_pos", seed: int = 0) -> None:
13
+ np.random.seed(seed)
14
+ self._rng = np.random.default_rng(seed)
15
+
16
+ self._gym = gymnasium.make(task, obs_type=obs_type)
17
+
18
+ self._last_obs = None
19
+ self._done = True
20
+ self._episode_reward = 0.0
21
+
22
+ @override
23
+ def reset(self) -> None:
24
+ gym_obs, _ = self._gym.reset(seed=int(self._rng.integers(2**32 - 1)))
25
+ self._last_obs = self._convert_observation(gym_obs) # type: ignore
26
+ self._done = False
27
+ self._episode_reward = 0.0
28
+
29
+ @override
30
+ def is_episode_complete(self) -> bool:
31
+ return self._done
32
+
33
+ @override
34
+ def get_observation(self) -> dict:
35
+ if self._last_obs is None:
36
+ raise RuntimeError("Observation is not set. Call reset() first.")
37
+
38
+ return self._last_obs # type: ignore
39
+
40
+ @override
41
+ def apply_action(self, action: dict) -> None:
42
+ gym_obs, reward, terminated, truncated, info = self._gym.step(action["actions"])
43
+ self._last_obs = self._convert_observation(gym_obs) # type: ignore
44
+ self._done = terminated or truncated
45
+ self._episode_reward = max(self._episode_reward, reward)
46
+
47
+ def _convert_observation(self, gym_obs: dict) -> dict:
48
+ img = gym_obs["pixels"]["top"]
49
+ img = image_tools.convert_to_uint8(image_tools.resize_with_pad(img, 224, 224))
50
+ # Convert axis order from [H, W, C] --> [C, H, W]
51
+ img = np.transpose(img, (2, 0, 1))
52
+
53
+ return {
54
+ "state": gym_obs["agent_pos"],
55
+ "images": {"cam_high": img},
56
+ }
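The observation conversion above boils down to a pad-preserving resize to 224×224, uint8 conversion, and a channel-first transpose; a standalone sketch:

```python
import numpy as np
from openpi_client import image_tools

def to_policy_image(img: np.ndarray) -> np.ndarray:
    """Sketch of the preprocessing in _convert_observation: HWC image in, CHW uint8 out."""
    img = image_tools.convert_to_uint8(image_tools.resize_with_pad(img, 224, 224))
    return np.transpose(img, (2, 0, 1))  # [H, W, C] -> [C, H, W]

dummy = np.zeros((480, 640, 3), dtype=np.uint8)   # e.g. a raw camera frame
assert to_policy_image(dummy).shape == (3, 224, 224)
```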
capvector-pi05/examples/aloha_sim/main.py ADDED
@@ -0,0 +1,55 @@
1
+ import dataclasses
2
+ import logging
3
+ import pathlib
4
+
5
+ import env as _env
6
+ from openpi_client import action_chunk_broker
7
+ from openpi_client import websocket_client_policy as _websocket_client_policy
8
+ from openpi_client.runtime import runtime as _runtime
9
+ from openpi_client.runtime.agents import policy_agent as _policy_agent
10
+ import saver as _saver
11
+ import tyro
12
+
13
+
14
+ @dataclasses.dataclass
15
+ class Args:
16
+ out_dir: pathlib.Path = pathlib.Path("data/aloha_sim/videos")
17
+
18
+ task: str = "gym_aloha/AlohaTransferCube-v0"
19
+ seed: int = 0
20
+
21
+ action_horizon: int = 10
22
+
23
+ host: str = "0.0.0.0"
24
+ port: int = 8000
25
+
26
+ display: bool = False
27
+
28
+
29
+ def main(args: Args) -> None:
30
+ runtime = _runtime.Runtime(
31
+ environment=_env.AlohaSimEnvironment(
32
+ task=args.task,
33
+ seed=args.seed,
34
+ ),
35
+ agent=_policy_agent.PolicyAgent(
36
+ policy=action_chunk_broker.ActionChunkBroker(
37
+ policy=_websocket_client_policy.WebsocketClientPolicy(
38
+ host=args.host,
39
+ port=args.port,
40
+ ),
41
+ action_horizon=args.action_horizon,
42
+ )
43
+ ),
44
+ subscribers=[
45
+ _saver.VideoSaver(args.out_dir),
46
+ ],
47
+ max_hz=50,
48
+ )
49
+
50
+ runtime.run()
51
+
52
+
53
+ if __name__ == "__main__":
54
+ logging.basicConfig(level=logging.INFO, force=True)
55
+ tyro.cli(main)
capvector-pi05/examples/aloha_sim/requirements.in ADDED
@@ -0,0 +1,8 @@
1
+ gym-aloha
2
+ imageio
3
+ matplotlib
4
+ msgpack
5
+ numpy>=1.22.4,<2.0.0
6
+ typing-extensions
7
+ tyro
8
+ websockets
capvector-pi05/examples/aloha_sim/requirements.txt ADDED
@@ -0,0 +1,132 @@
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv pip compile examples/aloha_sim/requirements.in -o examples/aloha_sim/requirements.txt --python-version 3.10
3
+ absl-py==2.1.0
4
+ # via
5
+ # dm-control
6
+ # dm-env
7
+ # labmaze
8
+ # mujoco
9
+ certifi==2024.8.30
10
+ # via requests
11
+ charset-normalizer==3.4.0
12
+ # via requests
13
+ cloudpickle==3.1.0
14
+ # via gymnasium
15
+ contourpy==1.3.1
16
+ # via matplotlib
17
+ cycler==0.12.1
18
+ # via matplotlib
19
+ dm-control==1.0.14
20
+ # via gym-aloha
21
+ dm-env==1.6
22
+ # via dm-control
23
+ dm-tree==0.1.8
24
+ # via
25
+ # dm-control
26
+ # dm-env
27
+ docstring-parser==0.16
28
+ # via tyro
29
+ farama-notifications==0.0.4
30
+ # via gymnasium
31
+ fonttools==4.55.2
32
+ # via matplotlib
33
+ glfw==2.8.0
34
+ # via
35
+ # dm-control
36
+ # mujoco
37
+ gym-aloha==0.1.1
38
+ # via -r examples/aloha_sim/requirements.in
39
+ gymnasium==1.0.0
40
+ # via gym-aloha
41
+ idna==3.10
42
+ # via requests
43
+ imageio==2.36.1
44
+ # via
45
+ # -r examples/aloha_sim/requirements.in
46
+ # gym-aloha
47
+ imageio-ffmpeg==0.5.1
48
+ # via imageio
49
+ kiwisolver==1.4.7
50
+ # via matplotlib
51
+ labmaze==1.0.6
52
+ # via dm-control
53
+ lxml==5.3.0
54
+ # via dm-control
55
+ markdown-it-py==3.0.0
56
+ # via rich
57
+ matplotlib==3.9.3
58
+ # via -r examples/aloha_sim/requirements.in
59
+ mdurl==0.1.2
60
+ # via markdown-it-py
61
+ msgpack==1.1.0
62
+ # via -r examples/aloha_sim/requirements.in
63
+ mujoco==2.3.7
64
+ # via
65
+ # dm-control
66
+ # gym-aloha
67
+ numpy==1.26.4
68
+ # via
69
+ # -r examples/aloha_sim/requirements.in
70
+ # contourpy
71
+ # dm-control
72
+ # dm-env
73
+ # gymnasium
74
+ # imageio
75
+ # labmaze
76
+ # matplotlib
77
+ # mujoco
78
+ # scipy
79
+ packaging==24.2
80
+ # via matplotlib
81
+ pillow==11.0.0
82
+ # via
83
+ # imageio
84
+ # matplotlib
85
+ protobuf==5.29.1
86
+ # via dm-control
87
+ psutil==6.1.0
88
+ # via imageio
89
+ pygments==2.18.0
90
+ # via rich
91
+ pyopengl==3.1.7
92
+ # via
93
+ # dm-control
94
+ # mujoco
95
+ pyparsing==3.2.0
96
+ # via
97
+ # dm-control
98
+ # matplotlib
99
+ python-dateutil==2.9.0.post0
100
+ # via matplotlib
101
+ requests==2.32.3
102
+ # via dm-control
103
+ rich==13.9.4
104
+ # via tyro
105
+ scipy==1.14.1
106
+ # via dm-control
107
+ setuptools==75.6.0
108
+ # via
109
+ # dm-control
110
+ # imageio-ffmpeg
111
+ # labmaze
112
+ shtab==1.7.1
113
+ # via tyro
114
+ six==1.17.0
115
+ # via python-dateutil
116
+ tqdm==4.67.1
117
+ # via dm-control
118
+ typeguard==4.4.1
119
+ # via tyro
120
+ typing-extensions==4.12.2
121
+ # via
122
+ # -r examples/aloha_sim/requirements.in
123
+ # gymnasium
124
+ # rich
125
+ # typeguard
126
+ # tyro
127
+ tyro==0.9.2
128
+ # via -r examples/aloha_sim/requirements.in
129
+ urllib3==2.2.3
130
+ # via requests
131
+ websockets==14.1
132
+ # via -r examples/aloha_sim/requirements.in
capvector-pi05/examples/aloha_sim/saver.py ADDED
@@ -0,0 +1,40 @@
1
+ import logging
2
+ import pathlib
3
+
4
+ import imageio
5
+ import numpy as np
6
+ from openpi_client.runtime import subscriber as _subscriber
7
+ from typing_extensions import override
8
+
9
+
10
+ class VideoSaver(_subscriber.Subscriber):
11
+ """Saves episode data."""
12
+
13
+ def __init__(self, out_dir: pathlib.Path, subsample: int = 1) -> None:
14
+ out_dir.mkdir(parents=True, exist_ok=True)
15
+ self._out_dir = out_dir
16
+ self._images: list[np.ndarray] = []
17
+ self._subsample = subsample
18
+
19
+ @override
20
+ def on_episode_start(self) -> None:
21
+ self._images = []
22
+
23
+ @override
24
+ def on_step(self, observation: dict, action: dict) -> None:
25
+ im = observation["images"]["cam_high"] # [C, H, W]
26
+ im = np.transpose(im, (1, 2, 0)) # [H, W, C]
27
+ self._images.append(im)
28
+
29
+ @override
30
+ def on_episode_end(self) -> None:
31
+ existing = list(self._out_dir.glob("out_[0-9]*.mp4"))
32
+ next_idx = max([int(p.stem.split("_")[1]) for p in existing], default=-1) + 1
33
+ out_path = self._out_dir / f"out_{next_idx}.mp4"
34
+
35
+ logging.info(f"Saving video to {out_path}")
36
+ imageio.mimwrite(
37
+ out_path,
38
+ [np.asarray(x) for x in self._images[:: self._subsample]],
39
+ fps=50 // max(1, self._subsample),
40
+ )
capvector-pi05/examples/convert_jax_model_to_pytorch.py ADDED
@@ -0,0 +1,587 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Load a JAX model and print all parameter keys, with optional conversion to PyTorch.
4
+
5
+ This script loads a JAX model checkpoint using orbax and can either:
6
+ 1. Print out all the parameter keys in a hierarchical structure for inspection
7
+ 2. Convert the JAX model to PyTorch format using our PI0Pytorch model
8
+
9
+ Usage:
10
+ # Just inspect keys:
11
+ python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --inspect_only
13
+
14
+ # Convert to PyTorch:
15
+ python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /path/to/checkpoint --output_path /path/to/output
17
+
18
+ Example:
19
+ # pi0_droid
20
+ python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_droid --output_path /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_droid_pytorch
21
+
22
+ # pi0_aloha_sim
23
+ python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_aloha_sim --output_path /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi0_aloha_sim_pytorch
24
+
25
+ # pi05_droid
26
+ python examples/convert_jax_model_to_pytorch.py --checkpoint_dir /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi05_droid --output_path /home/$USER/.cache/openpi/openpi-assets/checkpoints/pi05_droid_pytorch
27
+ """
28
+
29
+ import json
30
+ import os
31
+ import pathlib
32
+ import shutil
33
+ from typing import Literal
34
+
35
+ from flax.nnx import traversals
36
+ import numpy as np
37
+ import orbax.checkpoint as ocp
38
+ import safetensors
39
+ import torch
40
+ import tyro
41
+
42
+ import openpi.models.gemma
43
+ import openpi.models.model
44
+ import openpi.models.pi0_config
45
+ import openpi.models_pytorch.pi0_pytorch
46
+ from openpi.training import utils
47
+ import openpi.training.config as _config
48
+
49
+
50
+ def slice_paligemma_state_dict(state_dict, config):
51
+ """Convert PaliGemma JAX parameters to PyTorch format."""
52
+ suffix = "/value" if "img/embedding/kernel/value" in state_dict else ""
53
+
54
+ # patch embeddings
55
+ jax_key = f"img/embedding/kernel{suffix}"
56
+ pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.weight"
57
+ state_dict[pytorch_key] = state_dict.pop(jax_key).transpose(3, 2, 0, 1)
58
+
59
+ jax_key = f"img/embedding/bias{suffix}"
60
+ pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.patch_embedding.bias"
61
+ state_dict[pytorch_key] = state_dict.pop(jax_key)
62
+
63
+ # positional embeddings
64
+ jax_key = f"img/pos_embedding{suffix}"
65
+ pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.embeddings.position_embedding.weight"
66
+ state_dict[pytorch_key] = state_dict.pop(jax_key).reshape(-1, config.vision_config.hidden_size)
67
+
68
+ # extract vision layers to be sliced at index 0. There are 27 layers in the base model.
69
+ encoderblock_layernorm0_scale = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_0/scale{suffix}")
70
+ encoderblock_layernorm0_bias = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_0/bias{suffix}")
71
+ encoderblock_layernorm1_scale = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_1/scale{suffix}")
72
+ encoderblock_layernorm1_bias = state_dict.pop(f"img/Transformer/encoderblock/LayerNorm_1/bias{suffix}")
73
+
74
+ encoderblock_mlp_dense0_kernel = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_0/kernel{suffix}")
75
+ encoderblock_mlp_dense0_bias = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_0/bias{suffix}")
76
+ encoderblock_mlp_dense1_kernel = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_1/kernel{suffix}")
77
+ encoderblock_mlp_dense1_bias = state_dict.pop(f"img/Transformer/encoderblock/MlpBlock_0/Dense_1/bias{suffix}")
78
+
79
+ encoderblock_attention_0_key_kernel = state_dict.pop(
80
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/key/kernel{suffix}"
81
+ )
82
+ encoderblock_attention_0_key_bias = state_dict.pop(
83
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/key/bias{suffix}"
84
+ )
85
+ encoderblock_attention_0_value_kernel = state_dict.pop(
86
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/value/kernel{suffix}"
87
+ )
88
+ encoderblock_attention_0_value_bias = state_dict.pop(
89
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/value/bias{suffix}"
90
+ )
91
+ encoderblock_attention_0_query_kernel = state_dict.pop(
92
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/query/kernel{suffix}"
93
+ )
94
+ encoderblock_attention_0_query_bias = state_dict.pop(
95
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/query/bias{suffix}"
96
+ )
97
+ encoderblock_attention_0_out_kernel = state_dict.pop(
98
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/out/kernel{suffix}"
99
+ )
100
+ encoderblock_attention_0_out_bias = state_dict.pop(
101
+ f"img/Transformer/encoderblock/MultiHeadDotProductAttention_0/out/bias{suffix}"
102
+ )
103
+
104
+ for i in range(config.vision_config.num_hidden_layers):
105
+ state_dict[
106
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm1.weight"
107
+ ] = encoderblock_layernorm0_scale[i].transpose()
108
+ state_dict[
109
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm1.bias"
110
+ ] = encoderblock_layernorm0_bias[i]
111
+ state_dict[
112
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm2.weight"
113
+ ] = encoderblock_layernorm1_scale[i].transpose()
114
+ state_dict[
115
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.layer_norm2.bias"
116
+ ] = encoderblock_layernorm1_bias[i]
117
+ state_dict[
118
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc1.weight"
119
+ ] = encoderblock_mlp_dense0_kernel[i].transpose()
120
+ state_dict[
121
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc1.bias"
122
+ ] = encoderblock_mlp_dense0_bias[i]
123
+ state_dict[
124
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc2.weight"
125
+ ] = encoderblock_mlp_dense1_kernel[i].transpose()
126
+ state_dict[
127
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.mlp.fc2.bias"
128
+ ] = encoderblock_mlp_dense1_bias[i]
129
+ state_dict[
130
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.k_proj.weight"
131
+ ] = encoderblock_attention_0_key_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
132
+ state_dict[
133
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.k_proj.bias"
134
+ ] = encoderblock_attention_0_key_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
135
+ state_dict[
136
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.v_proj.weight"
137
+ ] = encoderblock_attention_0_value_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
138
+ state_dict[
139
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.v_proj.bias"
140
+ ] = encoderblock_attention_0_value_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
141
+ state_dict[
142
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.q_proj.weight"
143
+ ] = encoderblock_attention_0_query_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
144
+ state_dict[
145
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.q_proj.bias"
146
+ ] = encoderblock_attention_0_query_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
147
+ state_dict[
148
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.out_proj.weight"
149
+ ] = encoderblock_attention_0_out_kernel[i].reshape(-1, config.vision_config.hidden_size).transpose()
150
+ state_dict[
151
+ f"paligemma_with_expert.paligemma.model.vision_tower.vision_model.encoder.layers.{i}.self_attn.out_proj.bias"
152
+ ] = encoderblock_attention_0_out_bias[i].reshape(-1, config.vision_config.hidden_size).reshape(-1)
153
+
154
+ jax_key = f"img/Transformer/encoder_norm/scale{suffix}"
155
+ pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.post_layernorm.weight"
156
+ state_dict[pytorch_key] = state_dict.pop(jax_key).transpose()
157
+
158
+ jax_key = f"img/Transformer/encoder_norm/bias{suffix}"
159
+ pytorch_key = "paligemma_with_expert.paligemma.model.vision_tower.vision_model.post_layernorm.bias"
160
+ state_dict[pytorch_key] = state_dict.pop(jax_key)
161
+
162
+ # multimodal projector
163
+ jax_key = f"img/head/kernel{suffix}"
164
+ pytorch_key = "paligemma_with_expert.paligemma.model.multi_modal_projector.linear.weight"
165
+ state_dict[pytorch_key] = state_dict.pop(jax_key).transpose()
166
+
167
+ jax_key = f"img/head/bias{suffix}"
168
+ pytorch_key = "paligemma_with_expert.paligemma.model.multi_modal_projector.linear.bias"
169
+ state_dict[pytorch_key] = state_dict.pop(jax_key)
170
+
171
+ # text decoder (gemma)
172
+ jax_key = f"llm/embedder/input_embedding{suffix}"
173
+ pytorch_key = "paligemma_with_expert.paligemma.model.language_model.embed_tokens.weight"
174
+ state_dict[pytorch_key] = state_dict.pop(jax_key)
175
+
176
+ # pop the einsum attention + mlp representations
177
+ llm_attention_attn_vec_einsum = state_dict.pop(f"llm/layers/attn/attn_vec_einsum/w{suffix}")
178
+ llm_attention_kv_einsum = state_dict.pop(f"llm/layers/attn/kv_einsum/w{suffix}")
179
+ llm_attention_q_einsum = state_dict.pop(f"llm/layers/attn/q_einsum/w{suffix}")
180
+
181
+ llm_mlp_gating_einsum = state_dict.pop(f"llm/layers/mlp/gating_einsum{suffix}")
182
+ llm_mlp_linear = state_dict.pop(f"llm/layers/mlp/linear{suffix}")
183
+
184
+ llm_input_layernorm = state_dict.pop(f"llm/layers/pre_attention_norm/scale{suffix}")
185
+ llm_post_attention_layernorm = state_dict.pop(f"llm/layers/pre_ffw_norm/scale{suffix}")
186
+
187
+ for i in range(config.text_config.num_hidden_layers):
188
+ q_proj_weight_reshaped = (
189
+ llm_attention_q_einsum[i]
190
+ .transpose(0, 2, 1)
191
+ .reshape(
192
+ config.text_config.num_attention_heads * config.text_config.head_dim, config.text_config.hidden_size
193
+ )
194
+ )
195
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.q_proj.weight"] = (
196
+ q_proj_weight_reshaped
197
+ )
198
+
199
+ k_proj_weight_reshaped = llm_attention_kv_einsum[i, 0, 0].transpose()
200
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.k_proj.weight"] = (
201
+ k_proj_weight_reshaped
202
+ )
203
+ v_proj_weight_reshaped = llm_attention_kv_einsum[i, 1, 0].transpose()
204
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.v_proj.weight"] = (
205
+ v_proj_weight_reshaped
206
+ )
207
+
208
+ o_proj_weight_reshaped = (
209
+ llm_attention_attn_vec_einsum[i]
210
+ .transpose(2, 0, 1)
211
+ .reshape(
212
+ config.text_config.num_attention_heads * config.text_config.head_dim, config.text_config.hidden_size
213
+ )
214
+ )
215
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.self_attn.o_proj.weight"] = (
216
+ o_proj_weight_reshaped
217
+ )
218
+
219
+ gate_proj_weight = llm_mlp_gating_einsum[i, 0]
220
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.gate_proj.weight"] = (
221
+ gate_proj_weight.transpose()
222
+ )
223
+ up_proj_weight = llm_mlp_gating_einsum[i, 1]
224
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.up_proj.weight"] = (
225
+ up_proj_weight.transpose()
226
+ )
227
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.mlp.down_proj.weight"] = (
228
+ llm_mlp_linear[i].transpose()
229
+ )
230
+ state_dict[f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.input_layernorm.weight"] = (
231
+ llm_input_layernorm[i]
232
+ )
233
+ state_dict[
234
+ f"paligemma_with_expert.paligemma.model.language_model.layers.{i}.post_attention_layernorm.weight"
235
+ ] = llm_post_attention_layernorm[i]
236
+
237
+ jax_key = f"llm/final_norm/scale{suffix}"
238
+ pytorch_key = "paligemma_with_expert.paligemma.model.language_model.norm.weight"
239
+ state_dict[pytorch_key] = state_dict.pop(jax_key)
240
+
241
+ expert_dict = {}
242
+ final_state_dict = {}
243
+
244
+ # Expert-related keys to extract (including pi05 Dense layer parameters)
245
+ expert_keys = [
246
+ f"llm/final_norm_1/scale{suffix}",
247
+ f"llm/final_norm_1/Dense_0/bias{suffix}",
248
+ f"llm/final_norm_1/Dense_0/kernel{suffix}",
249
+ f"llm/layers/attn/attn_vec_einsum_1/w{suffix}",
250
+ f"llm/layers/attn/kv_einsum_1/w{suffix}",
251
+ f"llm/layers/attn/q_einsum_1/w{suffix}",
252
+ f"llm/layers/mlp_1/gating_einsum{suffix}",
253
+ f"llm/layers/mlp_1/linear{suffix}",
254
+ f"llm/layers/pre_attention_norm_1/scale{suffix}",
255
+ f"llm/layers/pre_attention_norm_1/Dense_0/bias{suffix}",
256
+ f"llm/layers/pre_attention_norm_1/Dense_0/kernel{suffix}",
257
+ f"llm/layers/pre_ffw_norm_1/scale{suffix}",
258
+ f"llm/layers/pre_ffw_norm_1/Dense_0/bias{suffix}",
259
+ f"llm/layers/pre_ffw_norm_1/Dense_0/kernel{suffix}",
260
+ ]
261
+
262
+ for key, value in state_dict.items():
263
+ if key not in expert_keys:
264
+ final_state_dict[key] = torch.from_numpy(value)
265
+ else:
266
+ expert_dict[key] = value
267
+
268
+ return final_state_dict, expert_dict
269
+
270
+
271
+ def slice_gemma_state_dict(state_dict, config, *, num_expert, checkpoint_dir, pi05):
272
+ """Convert Gemma JAX parameters to PyTorch format."""
273
+ # Add missing attributes to config if they don't exist
274
+ if not hasattr(config, "vocab_size"):
275
+ config.vocab_size = 257152 # PALIGEMMA_VOCAB_SIZE
276
+ if not hasattr(config, "hidden_size"):
277
+ config.hidden_size = config.width
278
+ if not hasattr(config, "num_hidden_layers"):
279
+ config.num_hidden_layers = config.depth
280
+ if not hasattr(config, "num_attention_heads"):
281
+ config.num_attention_heads = config.num_heads
282
+
283
+ suffix = "/value" if f"llm/layers/attn/attn_vec_einsum_{num_expert}/w/value" in state_dict else ""
284
+
285
+ llm_attention_attn_vec_einsum = state_dict.pop(f"llm/layers/attn/attn_vec_einsum_{num_expert}/w{suffix}")
286
+ llm_attention_kv_einsum = state_dict.pop(f"llm/layers/attn/kv_einsum_{num_expert}/w{suffix}")
287
+ llm_attention_q_einsum = state_dict.pop(f"llm/layers/attn/q_einsum_{num_expert}/w{suffix}")
288
+
289
+ llm_mlp_gating_einsum = state_dict.pop(f"llm/layers/mlp_{num_expert}/gating_einsum{suffix}")
290
+ llm_mlp_linear = state_dict.pop(f"llm/layers/mlp_{num_expert}/linear{suffix}")
291
+
292
+ # Check if we have Dense layers (for pi05/adaptive normalization) or scale layers (for regular pi0)
293
+ if "pi05" in checkpoint_dir:
294
+ # Pi05 with adaptive normalization
295
+ llm_input_layernorm_bias = state_dict.pop(f"llm/layers/pre_attention_norm_{num_expert}/Dense_0/bias{suffix}")
296
+ llm_post_attention_layernorm_bias = state_dict.pop(f"llm/layers/pre_ffw_norm_{num_expert}/Dense_0/bias{suffix}")
297
+ llm_input_layernorm_kernel = state_dict.pop(
298
+ f"llm/layers/pre_attention_norm_{num_expert}/Dense_0/kernel{suffix}"
299
+ )
300
+ llm_post_attention_layernorm_kernel = state_dict.pop(
301
+ f"llm/layers/pre_ffw_norm_{num_expert}/Dense_0/kernel{suffix}"
302
+ )
303
+ else:
304
+ # Regular pi0 with standard RMSNorm
305
+ llm_input_layernorm = state_dict.pop(f"llm/layers/pre_attention_norm_{num_expert}/scale{suffix}")
306
+ llm_post_attention_layernorm = state_dict.pop(f"llm/layers/pre_ffw_norm_{num_expert}/scale{suffix}")
307
+
308
+ for i in range(config.num_hidden_layers):
309
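+ # Shape note for the reshapes below: the per-layer JAX q einsum weight has shape
+ # [num_heads, hidden_size, head_dim]; transpose(0, 2, 1) gives [num_heads, head_dim, hidden_size],
+ # and the reshape flattens the head axes so the result matches nn.Linear's
+ # (out_features, in_features) weight layout.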
+ q_proj_weight_reshaped = (
310
+ llm_attention_q_einsum[i]
311
+ .transpose(0, 2, 1)
312
+ .reshape(config.num_attention_heads * config.head_dim, config.hidden_size)
313
+ )
314
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.q_proj.weight"] = (
315
+ q_proj_weight_reshaped
316
+ )
317
+
318
+ k_proj_weight_reshaped = llm_attention_kv_einsum[i, 0, 0].transpose()
319
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.k_proj.weight"] = (
320
+ k_proj_weight_reshaped
321
+ )
322
+ v_proj_weight_reshaped = llm_attention_kv_einsum[i, 1, 0].transpose()
323
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.v_proj.weight"] = (
324
+ v_proj_weight_reshaped
325
+ )
326
+
327
+ o_proj_weight_reshaped = (
328
+ llm_attention_attn_vec_einsum[i]
329
+ .reshape(config.num_attention_heads * config.head_dim, config.hidden_size)
330
+ .transpose(1, 0)
331
+ )
332
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.self_attn.o_proj.weight"] = (
333
+ o_proj_weight_reshaped
334
+ )
335
+
336
+ gate_proj_weight = llm_mlp_gating_einsum[i, 0]
337
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.gate_proj.weight"] = (
338
+ gate_proj_weight.transpose()
339
+ )
340
+ up_proj_weight = llm_mlp_gating_einsum[i, 1]
341
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.up_proj.weight"] = (
342
+ up_proj_weight.transpose()
343
+ )
344
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.mlp.down_proj.weight"] = llm_mlp_linear[
345
+ i
346
+ ].transpose()
347
+
348
+ if "pi05" in checkpoint_dir:
349
+ # Pi05 with adaptive normalization - use Dense layer parameters directly
350
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.dense.bias"] = (
351
+ llm_input_layernorm_bias[i]
352
+ )
353
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.dense.bias"] = (
354
+ llm_post_attention_layernorm_bias[i]
355
+ )
356
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.dense.weight"] = (
357
+ llm_input_layernorm_kernel[i].transpose()
358
+ )
359
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.dense.weight"] = (
360
+ llm_post_attention_layernorm_kernel[i].transpose()
361
+ )
362
+ else:
363
+ # Regular pi0 with standard RMSNorm
364
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.input_layernorm.weight"] = (
365
+ llm_input_layernorm[i]
366
+ )
367
+ state_dict[f"paligemma_with_expert.gemma_expert.model.layers.{i}.post_attention_layernorm.weight"] = (
368
+ llm_post_attention_layernorm[i]
369
+ )
370
+
371
+ # Handle final norm layer
372
+ if "pi05" in checkpoint_dir:
373
+ # Pi05 with adaptive normalization - use Dense layer parameters directly
374
+ final_norm_bias = state_dict.pop(f"llm/final_norm_{num_expert}/Dense_0/bias{suffix}")
375
+ final_norm_kernel = state_dict.pop(f"llm/final_norm_{num_expert}/Dense_0/kernel{suffix}")
376
+ state_dict["paligemma_with_expert.gemma_expert.model.norm.dense.bias"] = final_norm_bias
377
+ state_dict["paligemma_with_expert.gemma_expert.model.norm.dense.weight"] = final_norm_kernel.transpose()
378
+ else:
379
+ # Regular pi0 with standard RMSNorm
380
+ state_dict["paligemma_with_expert.gemma_expert.model.norm.weight"] = state_dict.pop(
381
+ f"llm/final_norm_{num_expert}/scale{suffix}"
382
+ )
383
+
384
+ # state_dict["paligemma_with_expert.gemma_expert.lm_head.weight"] = embedding_vector # weights are tied.
385
+
386
+ final_state_dict = {}
387
+ for key, value in state_dict.items():
388
+ if not isinstance(value, torch.Tensor):
389
+ final_state_dict[key] = torch.from_numpy(value)
390
+ else:
391
+ final_state_dict[key] = value
392
+
393
+ return final_state_dict
394
+
395
+
396
+ def slice_initial_orbax_checkpoint(checkpoint_dir: str, restore_precision: str | None = None):
397
+ """Load and process params by restoring via JAX model loader first.
398
+ This respects dtype conversions that occur during model restore.
399
+ """
400
+ # Use repository restore utility to load a pure dict of params (value suffix removed)
401
+ params = openpi.models.model.restore_params(
402
+ f"{checkpoint_dir}/params/", restore_type=np.ndarray, dtype=restore_precision
403
+ )
404
+
405
+ return {"paligemma_params": traversals.flatten_mapping(params["PaliGemma"], sep="/"), "projection_params": params}
406
+
407
+
408
+ def load_jax_model_and_print_keys(checkpoint_dir: str):
409
+ """
410
+ Load JAX model from checkpoint and print all parameter keys.
411
+
412
+ Args:
413
+ checkpoint_dir: Path to the checkpoint directory
414
+ """
415
+ checkpoint_dir = os.path.abspath(checkpoint_dir) if not checkpoint_dir.startswith("gs://") else checkpoint_dir
416
+ # Initialize checkpointer
417
+ checkpointer = ocp.PyTreeCheckpointer()
418
+ metadata = checkpointer.metadata(f"{checkpoint_dir}/params")
419
+ print(utils.array_tree_to_info(metadata))
420
+
421
+
422
+ def convert_pi0_checkpoint(
423
+ checkpoint_dir: str, precision: str, output_path: str, model_config: openpi.models.pi0_config.Pi0Config
424
+ ):
425
+ """
426
+ Convert PI0 JAX checkpoint to PyTorch format.
427
+
428
+ Args:
429
+ checkpoint_dir: Path to the JAX checkpoint
430
+ precision: Model precision (float32, bfloat16, float16)
431
+ output_path: Path to save the converted PyTorch model
432
+ model_config: Model config
433
+ """
434
+ print(f"Converting PI0 checkpoint from {checkpoint_dir} to {output_path}")
435
+ print(f"Model config: {model_config}")
436
+
437
+ # Break down orbax ckpts by restoring via JAX to respect dtype
438
+ initial_params = slice_initial_orbax_checkpoint(checkpoint_dir=checkpoint_dir, restore_precision="float32")
439
+
440
+ # Process projection params
441
+ if model_config.pi05:
442
+ keys = [
443
+ "action_in_proj",
444
+ "action_out_proj",
445
+ "time_mlp_in",
446
+ "time_mlp_out",
447
+ ]
448
+ else:
449
+ keys = [
450
+ "state_proj",
451
+ "action_in_proj",
452
+ "action_out_proj",
453
+ "action_time_mlp_in",
454
+ "action_time_mlp_out",
455
+ ]
456
+
457
+ projection_params = {}
458
+ for key in keys:
459
+ kernel_params = initial_params["projection_params"][key]["kernel"]
460
+ bias_params = initial_params["projection_params"][key]["bias"]
461
+ if isinstance(kernel_params, dict):
462
+ weight = kernel_params["value"]
463
+ bias = bias_params["value"]
464
+ else:
465
+ weight = kernel_params
466
+ bias = bias_params
467
+
468
+ pytorch_weight_key = f"{key}.weight"
469
+ pytorch_bias_key = f"{key}.bias"
470
+
471
+ projection_params[pytorch_weight_key] = torch.from_numpy(np.array(weight)).T
472
+ projection_params[pytorch_bias_key] = torch.from_numpy(np.array(bias))
473
+
474
+ # Create configs based on checkpoint path
475
+ # All models use the same PaliGemma config structure
476
+ class PaliGemmaConfig:
477
+ def __init__(self):
478
+ self.vision_config = type(
479
+ "obj",
480
+ (object,),
481
+ {
482
+ "hidden_size": 1152,
483
+ "num_hidden_layers": 27,
484
+ "num_attention_heads": 16,
485
+ "intermediate_size": 4304,
486
+ "patch_size": 14,
487
+ "projection_dim": 2048,
488
+ },
489
+ )()
490
+ self.text_config = type(
491
+ "obj",
492
+ (object,),
493
+ {
494
+ "hidden_size": 2048,
495
+ "num_hidden_layers": 18,
496
+ "num_attention_heads": 8,
497
+ "head_dim": 256,
498
+ "intermediate_size": 16384,
499
+ },
500
+ )()
501
+
502
+ paligemma_config = PaliGemmaConfig()
503
+ action_expert_config = openpi.models.gemma.get_config("gemma_300m")
504
+
505
+ # Process PaliGemma weights
506
+ paligemma_params, expert_params = slice_paligemma_state_dict(initial_params["paligemma_params"], paligemma_config)
507
+
508
+ # Process Gemma weights from expert_params
509
+ gemma_params = slice_gemma_state_dict(
510
+ expert_params, action_expert_config, num_expert=1, checkpoint_dir=checkpoint_dir, pi05=model_config.pi05
511
+ )
512
+
513
+ # Instantiate model
514
+ pi0_model = openpi.models_pytorch.pi0_pytorch.PI0Pytorch(model_config)
515
+
516
+ # Combine all parameters (no prefix needed for our model structure)
517
+ all_params = {**paligemma_params, **gemma_params, **projection_params}
518
+
519
+ # Load state dict
520
+ pi0_model.load_state_dict(all_params, strict=False)
521
+
522
+ if precision == "float32":
523
+ pi0_model = pi0_model.to(torch.float32)
524
+ elif precision == "bfloat16":
525
+ pi0_model = pi0_model.to(torch.bfloat16)
526
+ else:
527
+ raise ValueError(f"Invalid precision: {precision}")
528
+
529
+ # Save the converted model using safetensors
530
+ os.makedirs(output_path, exist_ok=True)
531
+
532
+ # Save model weights as SafeTensors using save_model to handle tied weights
533
+ safetensors.torch.save_model(pi0_model, os.path.join(output_path, "model.safetensors"))
534
+
535
+ # Copy assets folder if it exists
536
+ assets_source = pathlib.Path(checkpoint_dir).parent / "assets"
537
+ if assets_source.exists():
538
+ assets_dest = pathlib.Path(output_path) / "assets"
539
+ if assets_dest.exists():
540
+ shutil.rmtree(assets_dest)
541
+ shutil.copytree(assets_source, assets_dest)
542
+
543
+ # Save config as JSON for reference
544
+ config_dict = {
545
+ "action_dim": model_config.action_dim,
546
+ "action_horizon": model_config.action_horizon,
547
+ "paligemma_variant": model_config.paligemma_variant,
548
+ "action_expert_variant": model_config.action_expert_variant,
549
+ "precision": precision,
550
+ }
551
+ with open(os.path.join(output_path, "config.json"), "w") as f:
552
+ json.dump(config_dict, f, indent=2)
553
+
554
+ print("Model conversion completed successfully!")
555
+ print(f"Model saved to {output_path}")
556
+
557
+
558
+ def main(
559
+ checkpoint_dir: str,
560
+ config_name: str,
561
+ output_path: str | None = None,
562
+ precision: Literal["float32", "bfloat16", "float16"] = "bfloat16",
563
+ *,
564
+ inspect_only: bool = False,
565
+ ):
566
+ """Load JAX model and optionally convert to PyTorch.
567
+
568
+ Args:
569
+ checkpoint_dir: Path to the JAX checkpoint directory
+ config_name: Name of the training config whose model config is used for the conversion
570
+ output_path: Path to save converted PyTorch model (required for conversion)
571
+ precision: Precision for model conversion
572
+ inspect_only: Only inspect parameter keys, don't convert
573
+ """
574
+ model_config = _config.get_config(config_name).model
575
+ if not isinstance(model_config, openpi.models.pi0_config.Pi0Config):
576
+ raise ValueError(f"Config {config_name} is not a Pi0Config")
577
+ if inspect_only:
578
+ load_jax_model_and_print_keys(checkpoint_dir)
579
+ else:
580
+ if not output_path:
581
+ print("Error: --output_path is required for conversion. Use --inspect_only to only view keys.")
582
+ return
583
+ convert_pi0_checkpoint(checkpoint_dir, precision, output_path, model_config)
584
+
585
+
586
+ if __name__ == "__main__":
587
+ tyro.cli(main)
capvector-pi05/examples/droid/README.md ADDED
@@ -0,0 +1,84 @@
1
+ # DROID Policies in openpi
2
+
3
+ We offer instructions for:
4
+ - [Running inference for our best $\pi_{0.5}$-DROID policy](./README.md#running-droid-inference)
5
+ - [Running inference for other pre-trained DROID policies ($\pi_0$, $\pi_0$-FAST, ...)](./README.md#running-other-policies)
6
+ - [Pre-training *generalist* policies on the *full* DROID dataset](./README_train.md#training-on-droid)
7
+ - [Fine-tuning expert $\pi_{0.5}$ on your custom DROID dataset](./README_train.md#fine-tuning-on-custom-droid-datasets)
8
+
9
+ ## Running DROID Inference
10
+
11
+ This example shows how to run the fine-tuned $\pi_{0.5}$-DROID model on the [DROID robot platform](https://github.com/droid-dataset/droid). Based on the [public RoboArena benchmark](https://robo-arena.github.io/leaderboard), this is currently our strongest generalist DROID policy.
12
+
13
+
14
+ ### Step 1: Start a policy server
15
+
16
+ Since the DROID control laptop does not have a powerful GPU, we will start a remote policy server on a different machine with a more powerful GPU and then query it from the DROID control laptop during inference.
17
+
18
+ 1. On a machine with a powerful GPU (~NVIDIA 4090), clone and install the `openpi` repository following the instructions in the [README](https://github.com/Physical-Intelligence/openpi).
19
+ 2. Start the OpenPI server via the following command:
20
+
21
+ ```bash
22
+ uv run scripts/serve_policy.py policy:checkpoint --policy.config=pi05_droid --policy.dir=gs://openpi-assets/checkpoints/pi05_droid
23
+ ```
24
+
25
+ You can also run the equivalent command below:
26
+
27
+ ```bash
28
+ uv run scripts/serve_policy.py --env=DROID
29
+ ```
30
+
31
+ ### Step 2: Run the DROID robot
32
+
33
+ 1. Make sure you have the most recent version of the DROID package installed on both the DROID control laptop and the NUC.
34
+ 2. On the control laptop, activate your DROID conda environment.
35
+ 3. Clone the openpi repo and install the openpi client, which we will use to connect to the policy server (this has very few dependencies and should be very fast to install): with the DROID conda environment activated, run `cd $OPENPI_ROOT/packages/openpi-client && pip install -e .`.
36
+ 4. Install `tyro`, which we will use for command line parsing: `pip install tyro`.
37
+ 5. Copy the `main.py` file from this directory to the `$DROID_ROOT/scripts` directory.
38
+ 6. Replace the camera IDs in the `main.py` file with the IDs of your cameras (you can find the camera IDs by running `ZED_Explorer` in the command line, which will open a tool that shows you all connected cameras and their IDs -- you can also use it to make sure that the cameras are well-positioned to see the scene you want the robot to interact with).
39
+ 7. Run the `main.py` file. Make sure to point the remote host and port arguments at the policy server. (To check that the server machine is reachable from the DROID laptop, you can run `ping <server_ip>` from the DROID laptop.) Also make sure to specify which external camera to feed to the policy (we only input one external camera); choose from ["left", "right"].
40
+
41
+ ```bash
42
+ python3 scripts/main.py --remote_host=<server_ip> --remote_port=<server_port> --external_camera="left"
43
+ ```
44
+
45
+ The script will ask you to enter a free-form language instruction for the robot to follow. Make sure to point the cameras at the scene you want the robot to interact with. You _do not_ need to carefully control camera angle, object positions, etc. The policy is fairly robust in our experience. Happy prompting!
46
+
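+ Under the hood, `main.py` simply packages the camera images and robot state into a request, sends it to the policy server over a websocket, and executes the returned action chunk. The sketch below shows that core call with dummy observations (the observation keys and the (10, 8) chunk shape match what `main.py` uses; the image resolution and instruction are placeholders) -- it is illustrative only, use `main.py` for actual rollouts:
+ 
+ ```python
+ import numpy as np
+ from openpi_client import image_tools, websocket_client_policy
+ 
+ client = websocket_client_policy.WebsocketClientPolicy("<server_ip>", 8000)
+ 
+ dummy_image = np.zeros((720, 1280, 3), dtype=np.uint8)  # placeholder for a ZED camera frame
+ request = {
+     "observation/exterior_image_1_left": image_tools.resize_with_pad(dummy_image, 224, 224),
+     "observation/wrist_image_left": image_tools.resize_with_pad(dummy_image, 224, 224),
+     "observation/joint_position": np.zeros(7),
+     "observation/gripper_position": np.zeros(1),
+     "prompt": "<your language instruction>",
+ }
+ actions = client.infer(request)["actions"]  # chunk of shape (10, 8): 7 joint velocities + 1 gripper position
+ ```
+ 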
47
+ ## Troubleshooting
48
+
49
+ | Issue | Solution |
50
+ |-------|----------|
51
+ | Cannot reach policy server | Make sure the server is running and the IP and port are correct. You can check that the server machine is reachable by running `ping <server_ip>` from the DROID laptop. |
52
+ | Cannot find cameras | Make sure the camera IDs are correct and that the cameras are connected to the DROID laptop. Sometimes replugging the cameras can help. You can check all connected cameras by running `ZED_Explore` in the command line. |
53
+ | Policy inference is slow / inconsistent | Try using a wired internet connection for the DROID laptop to reduce latency (0.5 - 1 sec latency per chunk is normal). |
54
+ | Policy does not perform the task well | In our experiments, the policy could perform simple table top manipulation tasks (pick-and-place) across a wide range of environments, camera positions, and lighting conditions. If the policy does not perform the task well, you can try modifying the scene or object placement to make the task easier. Also make sure that the camera view you are passing to the policy can see all relevant objects in the scene (the policy is only conditioned on a single external camera + wrist camera, make sure you are feeding the desired camera to the policy). Use `ZED_Explore` to check that the camera view you are passing to the policy can see all relevant objects in the scene. Finally, the policy is far from perfect and will fail on more complex manipulation tasks, but it usually makes a decent effort. :) |
55
+
56
+
57
+ ## Running Other Policies
58
+
59
+ We provide configs for running the baseline DROID policies from the [RoboArena](https://robo-arena.github.io/) paper. Simply run the commands below to start inference servers for the respective policies. Then follow the instructions above to run evaluation on the DROID robot.
60
+
61
+ ```
62
+ # Train from pi0-FAST, using FAST tokenizer
63
+ uv run scripts/serve_policy.py policy:checkpoint --policy.config=pi0_fast_droid --policy.dir=gs://openpi-assets/checkpoints/pi0_fast_droid
64
+
65
+ # Train from pi0, using flow matching
66
+ uv run scripts/serve_policy.py policy:checkpoint --policy.config=pi0_droid --policy.dir=gs://openpi-assets/checkpoints/pi0_droid
67
+
68
+ # Trained from PaliGemma, using RT-2 / OpenVLA style binning tokenizer.
69
+ uv run scripts/serve_policy.py policy:checkpoint --policy.config=paligemma_binning_droid --policy.dir=gs://openpi-assets/checkpoints/roboarena/paligemma_binning_droid
70
+
71
+ # Trained from PaliGemma, using FAST tokenizer (using universal FAST+ tokenizer).
72
+ uv run scripts/serve_policy.py policy:checkpoint --policy.config=paligemma_fast_droid --policy.dir=gs://openpi-assets/checkpoints/roboarena/paligemma_fast_droid
73
+
74
+ # Trained from PaliGemma, using FAST tokenizer (tokenizer trained on DROID dataset).
75
+ uv run scripts/serve_policy.py policy:checkpoint --policy.config=paligemma_fast_specialist_droid --policy.dir=gs://openpi-assets/checkpoints/roboarena/paligemma_fast_specialist_droid
76
+
77
+ # Trained from PaliGemma, using FSQ tokenizer.
78
+ uv run scripts/serve_policy.py policy:checkpoint --policy.config=paligemma_vq_droid --policy.dir=gs://openpi-assets/checkpoints/roboarena/paligemma_vq_droid
79
+
80
+ # pi0-style diffusion / flow VLA, trained on DROID from PaliGemma.
81
+ uv run scripts/serve_policy.py policy:checkpoint --policy.config=paligemma_diffusion_droid --policy.dir=gs://openpi-assets/checkpoints/roboarena/paligemma_diffusion_droid
82
+ ```
83
+
84
+ You can find the inference configs in [roboarena_config.py](../../src/openpi/training/misc/roboarena_config.py).
capvector-pi05/examples/droid/README_train.md ADDED
@@ -0,0 +1,106 @@
1
+ # Training on DROID
2
+
3
+ Here we describe how to fine-tune the pi0.5 model on the *full* DROID dataset. This is an approximate open-source reproduction of the pi05-DROID training pipeline
4
+ (there are small differences in data loading and the action space used). For a tutorial on how to fine-tune your model on a smaller, custom dataset collected on the DROID platform, see below.
5
+
6
+ In contrast to the rest of openpi, which uses LeRobot for data loading, we need to use RLDS as the data format for full DROID training (since at the moment LeRobot isn't scalable enough
7
+ for larger datasets like DROID -- they are working on improving it though). Below, we provide instructions for updating your openpi environment for RLDS data loading and where to download the DROID dataset.
8
+
9
+ ## Install
10
+
11
+ We need a few additional dependencies for RLDS data loading. Run:
12
+ ```bash
13
+ uv sync --group rlds
14
+ ```
15
+
16
+ ## Download DROID dataset
17
+
18
+ You can download the DROID dataset with the following command (after installing the `gsutil` google cloud CLI):
19
+ ```
20
+ gsutil -m cp -r gs://gresearch/robotics/droid/1.0.1 <your_download_path>/droid/1.0.1
21
+ ```
22
+
23
+ Note that downloading version 1.0.1 is important (not v1.0.0): it contains the complete set of language annotations (~75k episodes) while v1.0.0 only has annotations for 30k episodes. If for some reason you would like to use another version, modify the line `version="1.0.1"` in the `DroidRldsDataset` object [here](src/openpi/training/droid_rlds_dataset.py).
24
+
25
+ You will need 1.8TB of disk storage to download the DROID RLDS dataset.
26
+
27
+ ## Run
28
+
29
+ First, change the `rlds_data_dir` path in your `TrainConfig` to the directory that you downloaded the `droid` dataset into (see [src/openpi/training/config.py](src/openpi/training/config.py)).
30
+
31
+ Then, compute normalization statistics (this will take ~10 minutes):
32
+ ```bash
33
+ uv run --group rlds scripts/compute_norm_stats.py --config-name pi05_full_droid_finetune --max-frames 10_000_000
34
+ ```
35
+
36
+ Run training:
37
+ ```bash
38
+ XLA_PYTHON_CLIENT_MEM_FRACTION=0.9 uv run --group rlds scripts/train.py pi05_full_droid_finetune --exp-name=my_experiment --overwrite
39
+ ```
40
+
41
+ **Note**: The original pi0.5-DROID model was trained with joint velocity actions.
42
+ Joint velocity actions are not compatible with simulated evaluation environments (much harder to simulate).
43
+ Thus, we do not recommend training with joint velocity actions and instead use joint position actions here.
44
+
45
+
46
+ ## Compute Requirements
47
+
48
+ Our DROID training config requires approximately 2 days on 8x H100 GPUs for convergence (100k iterations, bs256, approx. 1 epoch).
49
+ If you start from PaliGemma instead of pi0 initialization, plan with ~5 days on 8x H100s (240k iterations, i.e. 3 epochs).
50
+
51
+ We have experimented with LoRA for cheaper finetuning, but haven't found the policies to perform well so far.
52
+
53
+
54
+ ## Data Filtering
55
+
56
+ Like any diverse real-robot dataset, the DROID dataset isn't perfectly "clean" and we have found data filtering to significantly improve policy performance. Concretely, the DROID dataset contains many *idle* timesteps in which the robot does not move (in part due to the VR teleoperation interface that was used during data collection, we will not go into too much detail here). Appropriate filtering of these idle transitions can improve policy performance.
57
+
58
+ By default, our openpi training recipe implements the same idle filter used to train all pi-DROID models. We implement it by pre-computing which dataset indices to sample during training. You can check [compute_droid_nonidle_ranges.py](examples/droid/compute_droid_nonidle_ranges.py) for how we compute these indices. Roughly speaking, we filter any time steps for which the next chunk of actions would be largely idle. During training, our code automatically pulls our pre-computed list of indices from cloud storage and applies them. If you want to modify the idle filter / create your custom sampling logic, you can modify our script to generate a new index list and provide it via the `filter_dict_path="<path_to_filter_dict>"` argument in [src/openpi/training/config.py](src/openpi/training/config.py).
59
+
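+ For reference, the filter dict generated by [compute_droid_nonidle_ranges.py](examples/droid/compute_droid_nonidle_ranges.py) is a JSON file that maps each episode's unique ID (its recording folder path and trajectory file path, joined by `--`) to the list of frame ranges to keep. A minimal sketch of the structure the script builds before `json.dump` (the paths and ranges below are placeholders):
+ 
+ ```python
+ keep_ranges_map = {
+     "<recording_folderpath>--<file_path>": [(0, 20), (50, 80)],
+     "<recording_folderpath_2>--<file_path_2>": [(3, 41)],
+ }
+ ```
+ 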
60
+ **Note**: our list of filtering indices is only valid for the `droid/1.0.1` dataset mentioned in the download section above, and will not provide valid filtering for any other version of the DROID dataset, so make sure you download the dataset above! If you have a custom DROID version, you can rerun the [compute_droid_nonidle_ranges.py](examples/droid/compute_droid_nonidle_ranges.py) script to generate a new list of sampling indices.
61
+
62
+ ## RoboArena
63
+
64
+ Consider submitting your DROID policies to the [RoboArena benchmark](https://robo-arena.github.io/), which allows you to evaluate your policies on diverse tasks & scenes, **in the real world**! :)
65
+
66
+ If you have questions about RoboArena, please email [karl.pertsch@gmail.com](mailto:karl.pertsch@gmail.com).
67
+
68
+
69
+ # Fine-Tuning on Custom DROID Datasets
70
+
71
+ Here we describe how to fine-tune a model on a custom (smaller) dataset collected on the DROID platform. Like for other datasets, we will first convert the custom DROID dataset to LeRobot and then fine-tune a model (pi05-droid) on it.
72
+
73
+ Note: We use LeRobot here, since we assume the custom DROID fine-tuning dataset is relatively small (<10s of hours). For larger datasets (like the full DROID dataset) we recommend using RLDS for its better efficiency (see the example above).
74
+
75
+
76
+ ## Step 1: Converting your custom DROID dataset to LeRobot
77
+
78
+ We will use a small subset of the real DROID dataset for this example. This is a subset of just 30 demonstrations -- we assume that you will use your own dataset instead, but here is the command to download our subset (1.6GB):
79
+ ```
80
+ gsutil -m cp -r gs://gresearch/robotics/droid_raw/1.0.1/IRIS/success/2023-12-04 <your_target_path>
81
+ ```
82
+
83
+ We will also download the language annotations for the DROID dataset so we can pair our demonstrations with language instructions. Again, for your own data you can manually enter your language instructions and don't need to download our annotations. To download the DROID language annotations (12MB), run:
84
+ ```
85
+ gsutil -m cp -r gs://gresearch/robotics/droid_raw/1.0.1/aggregated-annotations-030724.json <your_target_dir>
86
+ ```
87
+
88
+ For your own dataset, make sure that each episode's directory contains a folder called `recordings/MP4` -- if not, you need to first run the MP4 video extraction (from SVO files) using the script [here](https://github.com/droid-dataset/droid/blob/main/scripts/convert/svo_to_mp4.py).
89
+
90
+ Now, we will use the `convert_droid_data_to_lerobot.py` script to create a LeRobot version of this dataset (takes <5min for the 30 demonstrations):
91
+ ```
92
+ uv run examples/droid/convert_droid_data_to_lerobot.py --data_dir <your_target_path>
93
+ ```
94
+
95
+ ## Step 2: Run fine-tuning with your custom dataset
96
+
97
+ Now we can run fine-tuning with our converted custom dataset. We provide an example config for fine-tuning `pi05_droid` on the custom dataset we created.
98
+ You can easily modify the config to work with other base models, or to use your own custom DROID dataset, in `config.py` (search for `pi05_droid_finetune`).
99
+
100
+ To launch training:
101
+ ```
102
+ uv run scripts/train.py pi05_droid_finetune --exp-name=my_experiment --overwrite
103
+ ```
104
+
105
+ Once trained, you can follow the instructions in [`examples/droid/README.md`](examples/droid/README.md) to serve the policy and run it on the robot.
106
+
capvector-pi05/examples/droid/compute_droid_nonidle_ranges.py ADDED
@@ -0,0 +1,103 @@
1
+ """
2
+ Iterates through the DROID dataset and creates a json mapping from episode unique IDs to ranges of time steps
3
+ that should be sampled during training (all others are filtered out).
4
+
5
+ Filtering logic:
6
+ We look for ranges of consecutive steps that contain fewer than min_idle_len consecutive idle frames
7
+ (defaults to 7 -- since most DROID action-chunking policies execute the first 8 actions generated in each chunk, filtering
8
+ this way means the policy will not get stuck outputting stationary actions). Additionally, we also only keep non-idle
9
+ ranges of length at least min_non_idle_len (defaults to 16 frames = ~1 second), while also removing the last
10
+ filter_last_n_in_ranges frames from the end of each range (as those all correspond to action chunks with many idle actions).
11
+
12
+ This leaves us with trajectory segments consisting of contiguous, significant movement. Training on this filtered set
13
+ yields policies that output fewer stationary actions (i.e., get "stuck" in states less).
14
+ """
15
+
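+ # Worked example (illustrative): with min_idle_len=7, min_non_idle_len=16, and
+ # filter_last_n_in_ranges=10, an episode whose idle mask is
+ # [False]*30 + [True]*20 + [False]*40 yields keep ranges [(0, 20), (50, 80)]:
+ # the 20-step idle block is dropped, both surviving non-idle segments are at least
+ # min_non_idle_len long, and 10 frames are trimmed from the end of each kept range.
+ 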
16
+ import json
17
+ import os
18
+ from pathlib import Path
19
+
20
+ import numpy as np
21
+ import tensorflow as tf
22
+ import tensorflow_datasets as tfds
23
+ from tqdm import tqdm
24
+
25
+ os.environ["CUDA_VISIBLE_DEVICES"] = "" # Set to the GPU you want to use, or leave empty for CPU
26
+
27
+ builder = tfds.builder_from_directory(
28
+ # path to the `droid` directory (not its parent)
29
+ builder_dir="<path_to_droid_dataset_tfds_files>",
30
+ )
31
+ ds = builder.as_dataset(split="train", shuffle_files=False)
32
+ tf.data.experimental.ignore_errors(ds)
33
+
34
+ keep_ranges_path = "<path_to_where_to_save_the_json>"
35
+
36
+ min_idle_len = 7 # If more than this number of consecutive idle frames, filter all of them out
37
+ min_non_idle_len = 16 # If fewer than this number of consecutive non-idle frames, filter all of them out
38
+ filter_last_n_in_ranges = 10 # When using a filter dict, remove this many frames from the end of each range
39
+
40
+ keep_ranges_map = {}
41
+ if Path(keep_ranges_path).exists():
42
+ with Path(keep_ranges_path).open("r") as f:
43
+ keep_ranges_map = json.load(f)
44
+ print(f"Resuming from {len(keep_ranges_map)} episodes already processed")
45
+
46
+ for ep_idx, ep in enumerate(tqdm(ds)):
47
+ recording_folderpath = ep["episode_metadata"]["recording_folderpath"].numpy().decode()
48
+ file_path = ep["episode_metadata"]["file_path"].numpy().decode()
49
+
50
+ key = f"{recording_folderpath}--{file_path}"
51
+ if key in keep_ranges_map:
52
+ continue
53
+
54
+ joint_velocities = [step["action_dict"]["joint_velocity"].numpy() for step in ep["steps"]]
55
+ joint_velocities = np.array(joint_velocities)
56
+
57
+ is_idle_array = np.hstack(
58
+ [np.array([False]), np.all(np.abs(joint_velocities[1:] - joint_velocities[:-1]) < 1e-3, axis=1)]
59
+ )
60
+
61
+ # Find what steps go from idle to non-idle and vice-versa
62
+ is_idle_padded = np.concatenate(
63
+ [[False], is_idle_array, [False]]
64
+ ) # Start and end with False, so idle at first step is a start of motion
65
+
66
+ is_idle_diff = np.diff(is_idle_padded.astype(int))
67
+ is_idle_true_starts = np.where(is_idle_diff == 1)[0] # +1 transitions --> going from idle to non-idle
68
+ is_idle_true_ends = np.where(is_idle_diff == -1)[0] # -1 transitions --> going from non-idle to idle
69
+
70
+ # Find which steps correspond to idle segments of length at least min_idle_len
71
+ true_segment_masks = (is_idle_true_ends - is_idle_true_starts) >= min_idle_len
72
+ is_idle_true_starts = is_idle_true_starts[true_segment_masks]
73
+ is_idle_true_ends = is_idle_true_ends[true_segment_masks]
74
+
75
+ keep_mask = np.ones(len(joint_velocities), dtype=bool)
76
+ for start, end in zip(is_idle_true_starts, is_idle_true_ends, strict=True):
77
+ keep_mask[start:end] = False
78
+
79
+ # Get all non-idle ranges of length at least min_non_idle_len
80
+ # Same logic as above, but for keep_mask, allowing us to filter out contiguous ranges of length < min_non_idle_len
81
+ keep_padded = np.concatenate([[False], keep_mask, [False]])
82
+
83
+ keep_diff = np.diff(keep_padded.astype(int))
84
+ keep_true_starts = np.where(keep_diff == 1)[0] # +1 transitions --> going from filter out to keep
85
+ keep_true_ends = np.where(keep_diff == -1)[0] # -1 transitions --> going from keep to filter out
86
+
87
+ # Find which steps correspond to non-idle segments of length at least min_non_idle_len
88
+ true_segment_masks = (keep_true_ends - keep_true_starts) >= min_non_idle_len
89
+ keep_true_starts = keep_true_starts[true_segment_masks]
90
+ keep_true_ends = keep_true_ends[true_segment_masks]
91
+
92
+ # Add mapping from episode unique ID key to list of non-idle ranges to keep
93
+ keep_ranges_map[key] = []
94
+ for start, end in zip(keep_true_starts, keep_true_ends, strict=True):
95
+ keep_ranges_map[key].append((int(start), int(end) - filter_last_n_in_ranges))
96
+
97
+ if ep_idx % 1000 == 0:
98
+ with Path(keep_ranges_path).open("w") as f:
99
+ json.dump(keep_ranges_map, f)
100
+
101
+ print("Done!")
102
+ with Path(keep_ranges_path).open("w") as f:
103
+ json.dump(keep_ranges_map, f)
capvector-pi05/examples/droid/convert_droid_data_to_lerobot.py ADDED
@@ -0,0 +1,477 @@
1
+ """
2
+ Minimal example script for converting a dataset collected on the DROID platform to LeRobot format.
3
+
4
+ Usage:
5
+ uv run examples/droid/convert_droid_data_to_lerobot.py --data_dir /path/to/your/data
6
+
7
+ If you want to push your dataset to the Hugging Face Hub, you can use the following command:
8
+ uv run examples/droid/convert_droid_data_to_lerobot.py --data_dir /path/to/your/data --push_to_hub
9
+
10
+ The resulting dataset will get saved to the $LEROBOT_HOME directory.
11
+ """
12
+
13
+ from collections import defaultdict
14
+ import copy
15
+ import glob
16
+ import json
17
+ from pathlib import Path
18
+ import shutil
19
+
20
+ import cv2
21
+ import h5py
22
+ from lerobot.common.datasets.lerobot_dataset import HF_LEROBOT_HOME
23
+ from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
24
+ import numpy as np
25
+ from PIL import Image
26
+ from tqdm import tqdm
27
+ import tyro
28
+
29
+ REPO_NAME = "your_hf_username/my_droid_dataset" # Name of the output dataset, also used for the Hugging Face Hub
30
+
31
+
32
+ def resize_image(image, size):
33
+ image = Image.fromarray(image)
34
+ return np.array(image.resize(size, resample=Image.BICUBIC))
35
+
36
+
37
+ def main(data_dir: str, *, push_to_hub: bool = False):
38
+ # Clean up any existing dataset in the output directory
39
+ output_path = HF_LEROBOT_HOME / REPO_NAME
40
+ if output_path.exists():
41
+ shutil.rmtree(output_path)
42
+ data_dir = Path(data_dir)
43
+
44
+ # Create LeRobot dataset, define features to store
45
+ # We will follow the DROID data naming conventions here.
46
+ # LeRobot assumes that dtype of image data is `image`
47
+ dataset = LeRobotDataset.create(
48
+ repo_id=REPO_NAME,
49
+ robot_type="panda",
50
+ fps=15, # DROID data is typically recorded at 15fps
51
+ features={
52
+ # We call this "left" since we will only use the left stereo camera (following DROID RLDS convention)
53
+ "exterior_image_1_left": {
54
+ "dtype": "image",
55
+ "shape": (180, 320, 3), # This is the resolution used in the DROID RLDS dataset
56
+ "names": ["height", "width", "channel"],
57
+ },
58
+ "exterior_image_2_left": {
59
+ "dtype": "image",
60
+ "shape": (180, 320, 3),
61
+ "names": ["height", "width", "channel"],
62
+ },
63
+ "wrist_image_left": {
64
+ "dtype": "image",
65
+ "shape": (180, 320, 3),
66
+ "names": ["height", "width", "channel"],
67
+ },
68
+ "joint_position": {
69
+ "dtype": "float32",
70
+ "shape": (7,),
71
+ "names": ["joint_position"],
72
+ },
73
+ "gripper_position": {
74
+ "dtype": "float32",
75
+ "shape": (1,),
76
+ "names": ["gripper_position"],
77
+ },
78
+ "actions": {
79
+ "dtype": "float32",
80
+ "shape": (8,), # We will use joint *velocity* actions here (7D) + gripper position (1D)
81
+ "names": ["actions"],
82
+ },
83
+ },
84
+ image_writer_threads=10,
85
+ image_writer_processes=5,
86
+ )
87
+
88
+ # Load language annotations
89
+ # Note: we load the DROID language annotations for this example, but you can manually define them for your own data
90
+ with (data_dir / "aggregated-annotations-030724.json").open() as f:
91
+ language_annotations = json.load(f)
92
+
93
+ # Loop over raw DROID fine-tuning datasets and write episodes to the LeRobot dataset
94
+ # We assume the following directory structure:
95
+ # RAW_DROID_PATH/
96
+ # - <...>/
97
+ # - recordings/
98
+ # - MP4/
99
+ # - <camera_id>.mp4 # single-view video of left stereo pair camera
100
+ # - trajectory.h5
101
+ # - <...>/
102
+ episode_paths = list(data_dir.glob("**/trajectory.h5"))
103
+ print(f"Found {len(episode_paths)} episodes for conversion")
104
+
105
+ # We will loop over each dataset_name and write episodes to the LeRobot dataset
106
+ for episode_path in tqdm(episode_paths, desc="Converting episodes"):
107
+ # Load raw data
108
+ recording_folderpath = episode_path.parent / "recordings" / "MP4"
109
+ trajectory = load_trajectory(str(episode_path), recording_folderpath=str(recording_folderpath))
110
+
111
+ # To load the language instruction, we need to parse out the episode_id from the metadata file
112
+ # Again, you can modify this step for your own data, to load your own language instructions
113
+ metadata_filepath = next(iter(episode_path.parent.glob("metadata_*.json")))
114
+ episode_id = metadata_filepath.name.split(".")[0].split("_")[-1]
115
+ language_instruction = language_annotations.get(episode_id, {"language_instruction1": "Do something"})[
116
+ "language_instruction1"
117
+ ]
118
+ print(f"Converting episode with language instruction: {language_instruction}")
119
+
120
+ # Write to LeRobot dataset
121
+ for step in trajectory:
122
+ camera_type_dict = step["observation"]["camera_type"]
123
+ wrist_ids = [k for k, v in camera_type_dict.items() if v == 0]
124
+ exterior_ids = [k for k, v in camera_type_dict.items() if v != 0]
125
+ dataset.add_frame(
126
+ {
127
+ # Note: need to flip BGR --> RGB for loaded images
128
+ "exterior_image_1_left": resize_image(
129
+ step["observation"]["image"][exterior_ids[0]][..., ::-1], (320, 180)
130
+ ),
131
+ "exterior_image_2_left": resize_image(
132
+ step["observation"]["image"][exterior_ids[1]][..., ::-1], (320, 180)
133
+ ),
134
+ "wrist_image_left": resize_image(step["observation"]["image"][wrist_ids[0]][..., ::-1], (320, 180)),
135
+ "joint_position": np.asarray(
136
+ step["observation"]["robot_state"]["joint_positions"], dtype=np.float32
137
+ ),
138
+ "gripper_position": np.asarray(
139
+ step["observation"]["robot_state"]["gripper_position"][None], dtype=np.float32
140
+ ),
141
+ # Important: we use joint velocity actions here since pi05-droid was pre-trained on joint velocity actions
142
+ "actions": np.concatenate(
143
+ [step["action"]["joint_velocity"], step["action"]["gripper_position"][None]], dtype=np.float32
144
+ ),
145
+ "task": language_instruction,
146
+ }
147
+ )
148
+ dataset.save_episode()
149
+
150
+ # Optionally push to the Hugging Face Hub
151
+ if push_to_hub:
152
+ dataset.push_to_hub(
153
+ tags=["libero", "panda", "rlds"],
154
+ private=False,
155
+ push_videos=True,
156
+ license="apache-2.0",
157
+ )
158
+
159
+
160
+ ##########################################################################################################
161
+ ################ The rest of this file are functions to parse the raw DROID data #########################
162
+ ################ You don't need to worry about understanding this part #########################
163
+ ################ It was copied from here: https://github.com/JonathanYang0127/r2d2_rlds_dataset_builder/blob/parallel_convert/r2_d2/r2_d2.py
164
+ ##########################################################################################################
165
+
166
+
167
+ camera_type_dict = {
168
+ "hand_camera_id": 0,
169
+ "varied_camera_1_id": 1,
170
+ "varied_camera_2_id": 1,
171
+ }
172
+
173
+ camera_type_to_string_dict = {
174
+ 0: "hand_camera",
175
+ 1: "varied_camera",
176
+ 2: "fixed_camera",
177
+ }
178
+
179
+
180
+ def get_camera_type(cam_id):
181
+ if cam_id not in camera_type_dict:
182
+ return None
183
+ type_int = camera_type_dict[cam_id]
184
+ return camera_type_to_string_dict[type_int]
185
+
186
+
187
+ class MP4Reader:
188
+ def __init__(self, filepath, serial_number):
189
+ # Save Parameters #
190
+ self.serial_number = serial_number
191
+ self._index = 0
192
+
193
+ # Open Video Reader #
194
+ self._mp4_reader = cv2.VideoCapture(filepath)
195
+ if not self._mp4_reader.isOpened():
196
+ raise RuntimeError("Corrupted MP4 File")
197
+
198
+ def set_reading_parameters(
199
+ self,
200
+ image=True, # noqa: FBT002
201
+ concatenate_images=False, # noqa: FBT002
202
+ resolution=(0, 0),
203
+ resize_func=None,
204
+ ):
205
+ # Save Parameters #
206
+ self.image = image
207
+ self.concatenate_images = concatenate_images
208
+ self.resolution = resolution
209
+ self.resize_func = cv2.resize
210
+ self.skip_reading = not image
211
+ if self.skip_reading:
212
+ return
213
+
214
+ def get_frame_resolution(self):
215
+ width = self._mp4_reader.get(cv2.cv.CV_CAP_PROP_FRAME_WIDTH)
216
+ height = self._mp4_reader.get(cv2.cv.CV_CAP_PROP_FRAME_HEIGHT)
217
+ return (width, height)
218
+
219
+ def get_frame_count(self):
220
+ if self.skip_reading:
221
+ return 0
222
+ return int(self._mp4_reader.get(cv2.cv.CV_CAP_PROP_FRAME_COUNT))
223
+
224
+ def set_frame_index(self, index):
225
+ if self.skip_reading:
226
+ return
227
+
228
+ if index < self._index:
229
+ self._mp4_reader.set(cv2.CAP_PROP_POS_FRAMES, index - 1)
230
+ self._index = index
231
+
232
+ while self._index < index:
233
+ self.read_camera(ignore_data=True)
234
+
235
+ def _process_frame(self, frame):
236
+ frame = copy.deepcopy(frame)
237
+ if self.resolution == (0, 0):
238
+ return frame
239
+ return self.resize_func(frame, self.resolution)
240
+
241
+ def read_camera(self, ignore_data=False, correct_timestamp=None): # noqa: FBT002
242
+ # Skip if Read Unnecesary #
243
+ if self.skip_reading:
244
+ return {}
245
+
246
+ # Read Camera #
247
+ success, frame = self._mp4_reader.read()
248
+
249
+ self._index += 1
250
+ if not success:
251
+ return None
252
+ if ignore_data:
253
+ return None
254
+
255
+ # Return Data #
256
+ data_dict = {}
257
+
258
+ if self.concatenate_images or "stereo" not in self.serial_number:
259
+ data_dict["image"] = {self.serial_number: self._process_frame(frame)}
260
+ else:
261
+ single_width = frame.shape[1] // 2
262
+ data_dict["image"] = {
263
+ self.serial_number + "_left": self._process_frame(frame[:, :single_width, :]),
264
+ self.serial_number + "_right": self._process_frame(frame[:, single_width:, :]),
265
+ }
266
+
267
+ return data_dict
268
+
269
+ def disable_camera(self):
270
+ if hasattr(self, "_mp4_reader"):
271
+ self._mp4_reader.release()
272
+
273
+
274
+ class RecordedMultiCameraWrapper:
275
+ def __init__(self, recording_folderpath, camera_kwargs={}): # noqa: B006
276
+ # Save Camera Info #
277
+ self.camera_kwargs = camera_kwargs
278
+
279
+ # Open Camera Readers #
280
+ mp4_filepaths = glob.glob(recording_folderpath + "/*.mp4")
281
+ all_filepaths = mp4_filepaths
282
+
283
+ self.camera_dict = {}
284
+ for f in all_filepaths:
285
+ serial_number = f.split("/")[-1][:-4]
286
+ cam_type = get_camera_type(serial_number)
287
+ camera_kwargs.get(cam_type, {})
288
+
289
+ if f.endswith(".mp4"):
290
+ Reader = MP4Reader # noqa: N806
291
+ else:
292
+ raise ValueError
293
+
294
+ self.camera_dict[serial_number] = Reader(f, serial_number)
295
+
296
+ def read_cameras(self, index=None, camera_type_dict={}, timestamp_dict={}): # noqa: B006
297
+ full_obs_dict = defaultdict(dict)
298
+
299
+ # Read Cameras In Randomized Order #
300
+ all_cam_ids = list(self.camera_dict.keys())
301
+ # random.shuffle(all_cam_ids)
302
+
303
+ for cam_id in all_cam_ids:
304
+ if "stereo" in cam_id:
305
+ continue
306
+ try:
307
+ cam_type = camera_type_dict[cam_id]
308
+ except KeyError:
309
+ print(f"{self.camera_dict} -- {camera_type_dict}")
310
+ raise ValueError(f"Camera type {cam_id} not found in camera_type_dict") # noqa: B904
311
+ curr_cam_kwargs = self.camera_kwargs.get(cam_type, {})
312
+ self.camera_dict[cam_id].set_reading_parameters(**curr_cam_kwargs)
313
+
314
+ timestamp = timestamp_dict.get(cam_id + "_frame_received", None)
315
+ if index is not None:
316
+ self.camera_dict[cam_id].set_frame_index(index)
317
+
318
+ data_dict = self.camera_dict[cam_id].read_camera(correct_timestamp=timestamp)
319
+
320
+ # Process Returned Data #
321
+ if data_dict is None:
322
+ return None
323
+ for key in data_dict:
324
+ full_obs_dict[key].update(data_dict[key])
325
+
326
+ return full_obs_dict
327
+
328
+
329
+ def get_hdf5_length(hdf5_file, keys_to_ignore=[]): # noqa: B006
330
+ length = None
331
+
332
+ for key in hdf5_file:
333
+ if key in keys_to_ignore:
334
+ continue
335
+
336
+ curr_data = hdf5_file[key]
337
+ if isinstance(curr_data, h5py.Group):
338
+ curr_length = get_hdf5_length(curr_data, keys_to_ignore=keys_to_ignore)
339
+ elif isinstance(curr_data, h5py.Dataset):
340
+ curr_length = len(curr_data)
341
+ else:
342
+ raise ValueError
343
+
344
+ if length is None:
345
+ length = curr_length
346
+ assert curr_length == length
347
+
348
+ return length
349
+
350
+
351
+ def load_hdf5_to_dict(hdf5_file, index, keys_to_ignore=[]): # noqa: B006
352
+ data_dict = {}
353
+
354
+ for key in hdf5_file:
355
+ if key in keys_to_ignore:
356
+ continue
357
+
358
+ curr_data = hdf5_file[key]
359
+ if isinstance(curr_data, h5py.Group):
360
+ data_dict[key] = load_hdf5_to_dict(curr_data, index, keys_to_ignore=keys_to_ignore)
361
+ elif isinstance(curr_data, h5py.Dataset):
362
+ data_dict[key] = curr_data[index]
363
+ else:
364
+ raise ValueError
365
+
366
+ return data_dict
367
+
368
+
369
+ class TrajectoryReader:
370
+ def __init__(self, filepath, read_images=True): # noqa: FBT002
371
+ self._hdf5_file = h5py.File(filepath, "r")
372
+ is_video_folder = "observations/videos" in self._hdf5_file
373
+ self._read_images = read_images and is_video_folder
374
+ self._length = get_hdf5_length(self._hdf5_file)
375
+ self._video_readers = {}
376
+ self._index = 0
377
+
378
+ def length(self):
379
+ return self._length
380
+
381
+ def read_timestep(self, index=None, keys_to_ignore=[]): # noqa: B006
382
+ # Make Sure We Read Within Range #
383
+ if index is None:
384
+ index = self._index
385
+ else:
386
+ assert not self._read_images
387
+ self._index = index
388
+ assert index < self._length
389
+
390
+ # Load Low Dimensional Data #
391
+ keys_to_ignore = [*keys_to_ignore.copy(), "videos"]
392
+ timestep = load_hdf5_to_dict(self._hdf5_file, self._index, keys_to_ignore=keys_to_ignore)
393
+
394
+ # Increment Read Index #
395
+ self._index += 1
396
+
397
+ # Return Timestep #
398
+ return timestep
399
+
400
+ def close(self):
401
+ self._hdf5_file.close()
402
+
403
+
404
+ def load_trajectory(
405
+ filepath=None,
406
+ read_cameras=True, # noqa: FBT002
407
+ recording_folderpath=None,
408
+ camera_kwargs={}, # noqa: B006
409
+ remove_skipped_steps=False, # noqa: FBT002
410
+ num_samples_per_traj=None,
411
+ num_samples_per_traj_coeff=1.5,
412
+ ):
413
+ read_recording_folderpath = read_cameras and (recording_folderpath is not None)
414
+
415
+ traj_reader = TrajectoryReader(filepath)
416
+ if read_recording_folderpath:
417
+ camera_reader = RecordedMultiCameraWrapper(recording_folderpath, camera_kwargs)
418
+
419
+ horizon = traj_reader.length()
420
+ timestep_list = []
421
+
422
+ # Choose Timesteps To Save #
423
+ if num_samples_per_traj:
424
+ num_to_save = num_samples_per_traj
425
+ if remove_skipped_steps:
426
+ num_to_save = int(num_to_save * num_samples_per_traj_coeff)
427
+ max_size = min(num_to_save, horizon)
428
+ indices_to_save = np.sort(np.random.choice(horizon, size=max_size, replace=False))
429
+ else:
430
+ indices_to_save = np.arange(horizon)
431
+
432
+ # Iterate Over Trajectory #
433
+ for i in indices_to_save:
434
+ # Get HDF5 Data #
435
+ timestep = traj_reader.read_timestep(index=i)
436
+
437
+ # If Applicable, Get Recorded Data #
438
+ if read_recording_folderpath:
439
+ timestamp_dict = timestep["observation"]["timestamp"]["cameras"]
440
+ camera_type_dict = {
441
+ k: camera_type_to_string_dict[v] for k, v in timestep["observation"]["camera_type"].items()
442
+ }
443
+ camera_obs = camera_reader.read_cameras(
444
+ index=i, camera_type_dict=camera_type_dict, timestamp_dict=timestamp_dict
445
+ )
446
+ camera_failed = camera_obs is None
447
+
448
+ # Add Data To Timestep If Successful #
449
+ if camera_failed:
450
+ break
451
+ timestep["observation"].update(camera_obs)
452
+
453
+ # Filter Steps #
454
+ step_skipped = not timestep["observation"]["controller_info"].get("movement_enabled", True)
455
+ delete_skipped_step = step_skipped and remove_skipped_steps
456
+
457
+ # Save Filtered Timesteps #
458
+ if delete_skipped_step:
459
+ del timestep
460
+ else:
461
+ timestep_list.append(timestep)
462
+
463
+ # Remove Extra Transitions #
464
+ timestep_list = np.array(timestep_list)
465
+ if (num_samples_per_traj is not None) and (len(timestep_list) > num_samples_per_traj):
466
+ ind_to_keep = np.random.choice(len(timestep_list), size=num_samples_per_traj, replace=False)
467
+ timestep_list = timestep_list[ind_to_keep]
468
+
469
+ # Close Readers #
470
+ traj_reader.close()
471
+
472
+ # Return Data #
473
+ return timestep_list
474
+
475
+
476
+ if __name__ == "__main__":
477
+ tyro.cli(main)
capvector-pi05/examples/droid/main.py ADDED
@@ -0,0 +1,246 @@
1
+ # ruff: noqa
2
+
3
+ import contextlib
4
+ import dataclasses
5
+ import datetime
6
+ import faulthandler
7
+ import os
8
+ import signal
9
+ import time
10
+ from moviepy.editor import ImageSequenceClip
11
+ import numpy as np
12
+ from openpi_client import image_tools
13
+ from openpi_client import websocket_client_policy
14
+ import pandas as pd
15
+ from PIL import Image
16
+ from droid.robot_env import RobotEnv
17
+ import tqdm
18
+ import tyro
19
+
20
+ faulthandler.enable()
21
+
22
+ # DROID data collection frequency -- we slow down execution to match this frequency
23
+ DROID_CONTROL_FREQUENCY = 15
24
+
25
+
26
+ @dataclasses.dataclass
27
+ class Args:
28
+ # Hardware parameters
29
+ left_camera_id: str = "<your_camera_id>" # e.g., "24259877"
30
+ right_camera_id: str = "<your_camera_id>" # e.g., "24514023"
31
+ wrist_camera_id: str = "<your_camera_id>" # e.g., "13062452"
32
+
33
+ # Policy parameters
34
+ external_camera: str | None = (
35
+ None # which external camera should be fed to the policy, choose from ["left", "right"]
36
+ )
37
+
38
+ # Rollout parameters
39
+ max_timesteps: int = 600
40
+ # How many actions to execute from a predicted action chunk before querying policy server again
41
+ # 8 is usually a good default (equals 0.5 seconds of action execution).
42
+ open_loop_horizon: int = 8
43
+
44
+ # Remote server parameters
45
+ remote_host: str = "0.0.0.0" # point this to the IP address of the policy server, e.g., "192.168.1.100"
46
+ remote_port: int = (
47
+ 8000 # point this to the port of the policy server, default server port for openpi servers is 8000
48
+ )
49
+
50
+
51
+ # We are using Ctrl+C to optionally terminate rollouts early -- however, if we press Ctrl+C while the policy server is
52
+ # waiting for a new action chunk, it will raise an exception and the server connection dies.
53
+ # This context manager temporarily prevents Ctrl+C and delays it after the server call is complete.
54
+ @contextlib.contextmanager
55
+ def prevent_keyboard_interrupt():
56
+ """Temporarily prevent keyboard interrupts by delaying them until after the protected code."""
57
+ interrupted = False
58
+ original_handler = signal.getsignal(signal.SIGINT)
59
+
60
+ def handler(signum, frame):
61
+ nonlocal interrupted
62
+ interrupted = True
63
+
64
+ signal.signal(signal.SIGINT, handler)
65
+ try:
66
+ yield
67
+ finally:
68
+ signal.signal(signal.SIGINT, original_handler)
69
+ if interrupted:
70
+ raise KeyboardInterrupt
71
+
72
+
73
+ def main(args: Args):
74
+ # Make sure external camera is specified by user -- we only use one external camera for the policy
75
+ assert (
76
+ args.external_camera is not None and args.external_camera in ["left", "right"]
77
+ ), f"Please specify an external camera to use for the policy, choose from ['left', 'right'], but got {args.external_camera}"
78
+
79
+ # Initialize the Panda environment. Using joint velocity action space and gripper position action space is very important.
80
+ env = RobotEnv(action_space="joint_velocity", gripper_action_space="position")
81
+ print("Created the droid env!")
82
+
83
+ # Connect to the policy server
84
+ policy_client = websocket_client_policy.WebsocketClientPolicy(args.remote_host, args.remote_port)
85
+
86
+ df = pd.DataFrame(columns=["success", "duration", "video_filename"])
87
+
88
+ while True:
89
+ instruction = input("Enter instruction: ")
90
+
91
+ # Rollout parameters
92
+ actions_from_chunk_completed = 0
93
+ pred_action_chunk = None
94
+
95
+ # Prepare to save video of rollout
96
+ timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H:%M:%S")
97
+ video = []
98
+ bar = tqdm.tqdm(range(args.max_timesteps))
99
+ print("Running rollout... press Ctrl+C to stop early.")
100
+ for t_step in bar:
101
+ start_time = time.time()
102
+ try:
103
+ # Get the current observation
104
+ curr_obs = _extract_observation(
105
+ args,
106
+ env.get_observation(),
107
+ # Save the first observation to disk
108
+ save_to_disk=t_step == 0,
109
+ )
110
+
111
+ video.append(curr_obs[f"{args.external_camera}_image"])
112
+
113
+ # Send websocket request to policy server if it's time to predict a new chunk
114
+ if actions_from_chunk_completed == 0 or actions_from_chunk_completed >= args.open_loop_horizon:
115
+ actions_from_chunk_completed = 0
116
+
117
+ # We resize images on the robot laptop to minimize the amount of data sent to the policy server
118
+ # and improve latency.
119
+ request_data = {
120
+ "observation/exterior_image_1_left": image_tools.resize_with_pad(
121
+ curr_obs[f"{args.external_camera}_image"], 224, 224
122
+ ),
123
+ "observation/wrist_image_left": image_tools.resize_with_pad(curr_obs["wrist_image"], 224, 224),
124
+ "observation/joint_position": curr_obs["joint_position"],
125
+ "observation/gripper_position": curr_obs["gripper_position"],
126
+ "prompt": instruction,
127
+ }
128
+
129
+ # Wrap the server call in a context manager to prevent Ctrl+C from interrupting it
130
+ # Ctrl+C will be handled after the server call is complete
131
+ with prevent_keyboard_interrupt():
132
+ # this returns action chunk [10, 8] of 10 joint velocity actions (7) + gripper position (1)
133
+ pred_action_chunk = policy_client.infer(request_data)["actions"]
134
+ assert pred_action_chunk.shape == (10, 8)
135
+
136
+ # Select current action to execute from chunk
137
+ action = pred_action_chunk[actions_from_chunk_completed]
138
+ actions_from_chunk_completed += 1
139
+
140
+ # Binarize gripper action
141
+ if action[-1].item() > 0.5:
142
+ # action[-1] = 1.0
143
+ action = np.concatenate([action[:-1], np.ones((1,))])
144
+ else:
145
+ # action[-1] = 0.0
146
+ action = np.concatenate([action[:-1], np.zeros((1,))])
147
+
148
+ # clip all dimensions of action to [-1, 1]
149
+ action = np.clip(action, -1, 1)
150
+
151
+ env.step(action)
152
+
153
+ # Sleep to match DROID data collection frequency
154
+ elapsed_time = time.time() - start_time
155
+ if elapsed_time < 1 / DROID_CONTROL_FREQUENCY:
156
+ time.sleep(1 / DROID_CONTROL_FREQUENCY - elapsed_time)
157
+ except KeyboardInterrupt:
158
+ break
159
+
160
+ video = np.stack(video)
161
+ save_filename = "video_" + timestamp
162
+ ImageSequenceClip(list(video), fps=10).write_videofile(save_filename + ".mp4", codec="libx264")
163
+
164
+ success: float | None = None
+ while not isinstance(success, float):
+     user_input = input(
+         "Did the rollout succeed? (enter y for 100%, n for 0%, or a numeric value 0-100 based on the evaluation spec) "
+     )
+     if user_input == "y":
+         success = 1.0
+     elif user_input == "n":
+         success = 0.0
+     else:
+         # Interpret anything else as a percentage in [0, 100] and convert to [0, 1].
+         try:
+             success = float(user_input) / 100
+         except ValueError:
+             print(f"Success must be y, n, or a number in [0, 100], but got: {user_input!r}")
+             continue
+         if not (0 <= success <= 1):
+             print(f"Success must be a number in [0, 100] but got: {user_input}")
+             success = None
177
+
178
+ # DataFrame.append was removed in pandas 2.0; build the new row and concatenate instead.
+ new_row = pd.DataFrame([{"success": success, "duration": t_step, "video_filename": save_filename}])
+ df = pd.concat([df, new_row], ignore_index=True)
186
+
187
+ if input("Do one more eval? (enter y or n) ").lower() != "y":
188
+ break
189
+ env.reset()
190
+
191
+ os.makedirs("results", exist_ok=True)
192
+ timestamp = datetime.datetime.now().strftime("%I:%M%p_%B_%d_%Y")
193
+ csv_filename = os.path.join("results", f"eval_{timestamp}.csv")
194
+ df.to_csv(csv_filename)
195
+ print(f"Results saved to {csv_filename}")
196
+
197
+
198
+ def _extract_observation(args: Args, obs_dict, *, save_to_disk=False):
199
+ image_observations = obs_dict["image"]
200
+ left_image, right_image, wrist_image = None, None, None
201
+ for key in image_observations:
202
+ # Note the "left" below refers to the left camera in the stereo pair.
203
+ # The model is only trained on left stereo cams, so we only feed those.
204
+ if args.left_camera_id in key and "left" in key:
205
+ left_image = image_observations[key]
206
+ elif args.right_camera_id in key and "left" in key:
207
+ right_image = image_observations[key]
208
+ elif args.wrist_camera_id in key and "left" in key:
209
+ wrist_image = image_observations[key]
210
+
211
+ # Drop the alpha channel
212
+ left_image = left_image[..., :3]
213
+ right_image = right_image[..., :3]
214
+ wrist_image = wrist_image[..., :3]
215
+
216
+ # Convert from BGR to RGB by reversing the channel order
217
+ left_image = left_image[..., ::-1]
218
+ right_image = right_image[..., ::-1]
219
+ wrist_image = wrist_image[..., ::-1]
220
+
221
+ # In addition to image observations, also capture the proprioceptive state
222
+ robot_state = obs_dict["robot_state"]
223
+ cartesian_position = np.array(robot_state["cartesian_position"])
224
+ joint_position = np.array(robot_state["joint_positions"])
225
+ gripper_position = np.array([robot_state["gripper_position"]])
226
+
227
+ # Save the images to disk so that they can be viewed live while the robot is running
228
+ # Create one combined image to make live viewing easy
229
+ if save_to_disk:
230
+ combined_image = np.concatenate([left_image, wrist_image, right_image], axis=1)
231
+ combined_image = Image.fromarray(combined_image)
232
+ combined_image.save("robot_camera_views.png")
233
+
234
+ return {
235
+ "left_image": left_image,
236
+ "right_image": right_image,
237
+ "wrist_image": wrist_image,
238
+ "cartesian_position": cartesian_position,
239
+ "joint_position": joint_position,
240
+ "gripper_position": gripper_position,
241
+ }
242
+
243
+
244
+ if __name__ == "__main__":
245
+ args: Args = tyro.cli(Args)
246
+ main(args)
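
The rollout loop above uses a receding-horizon pattern: the policy server returns a chunk of 10 actions, and a new chunk is requested only after `open_loop_horizon` of them have been executed (any remaining actions in the old chunk are dropped). Below is a minimal, self-contained sketch of that pattern; the stub policy and environment, and the `OPEN_LOOP_HORIZON` value, are placeholders for illustration, not part of this repo (the real script reads the horizon from `Args`).

```python
import numpy as np

CHUNK_LEN, ACTION_DIM = 10, 8   # matches the [10, 8] chunks asserted above
OPEN_LOOP_HORIZON = 8           # placeholder value


def stub_policy_infer() -> np.ndarray:
    """Stand-in for policy_client.infer(request)["actions"]."""
    return np.random.uniform(-1.0, 1.0, size=(CHUNK_LEN, ACTION_DIM))


def stub_env_step(action: np.ndarray) -> None:
    """Stand-in for env.step(action)."""
    assert action.shape == (ACTION_DIM,)


actions_from_chunk_completed = 0
pred_action_chunk = None
for t_step in range(40):
    # Request a new chunk only when the previous one has been consumed.
    if actions_from_chunk_completed == 0 or actions_from_chunk_completed >= OPEN_LOOP_HORIZON:
        actions_from_chunk_completed = 0
        pred_action_chunk = stub_policy_infer()
    action = pred_action_chunk[actions_from_chunk_completed]
    actions_from_chunk_completed += 1
    stub_env_step(np.clip(action, -1.0, 1.0))
```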
capvector-pi05/examples/inference.ipynb ADDED
@@ -0,0 +1,137 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import dataclasses\n",
10
+ "\n",
11
+ "import jax\n",
12
+ "\n",
13
+ "from openpi.models import model as _model\n",
14
+ "from openpi.policies import droid_policy\n",
15
+ "from openpi.policies import policy_config as _policy_config\n",
16
+ "from openpi.shared import download\n",
17
+ "from openpi.training import config as _config\n",
18
+ "from openpi.training import data_loader as _data_loader"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "markdown",
23
+ "metadata": {},
24
+ "source": [
25
+ "# Policy inference\n",
26
+ "\n",
27
+ "The following example shows how to create a policy from a checkpoint and run inference on a dummy example."
28
+ ]
29
+ },
30
+ {
31
+ "cell_type": "code",
32
+ "execution_count": null,
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "config = _config.get_config(\"pi0_fast_droid\")\n",
37
+ "checkpoint_dir = download.maybe_download(\"gs://openpi-assets/checkpoints/pi0_fast_droid\")\n",
38
+ "\n",
39
+ "# Create a trained policy.\n",
40
+ "policy = _policy_config.create_trained_policy(config, checkpoint_dir)\n",
41
+ "\n",
42
+ "# Run inference on a dummy example. This example corresponds to observations produced by the DROID runtime.\n",
43
+ "example = droid_policy.make_droid_example()\n",
44
+ "result = policy.infer(example)\n",
45
+ "\n",
46
+ "# Delete the policy to free up memory.\n",
47
+ "del policy\n",
48
+ "\n",
49
+ "print(\"Actions shape:\", result[\"actions\"].shape)"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "markdown",
54
+ "metadata": {},
55
+ "source": [
56
+ "# Working with a live model\n",
57
+ "\n",
58
+ "\n",
59
+ "The following example shows how to create a live model from a checkpoint and compute training loss. First, we are going to demonstrate how to do it with fake data.\n"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "code",
64
+ "execution_count": null,
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "config = _config.get_config(\"pi0_aloha_sim\")\n",
69
+ "\n",
70
+ "checkpoint_dir = download.maybe_download(\"gs://openpi-assets/checkpoints/pi0_aloha_sim\")\n",
71
+ "key = jax.random.key(0)\n",
72
+ "\n",
73
+ "# Create a model from the checkpoint.\n",
74
+ "model = config.model.load(_model.restore_params(checkpoint_dir / \"params\"))\n",
75
+ "\n",
76
+ "# We can create fake observations and actions to test the model.\n",
77
+ "obs, act = config.model.fake_obs(), config.model.fake_act()\n",
78
+ "\n",
79
+ "# Sample actions from the model.\n",
80
+ "loss = model.compute_loss(key, obs, act)\n",
81
+ "print(\"Loss shape:\", loss.shape)"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "metadata": {},
87
+ "source": [
88
+ "Now, we are going to create a data loader and use a real batch of training data to compute the loss."
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": null,
94
+ "metadata": {},
95
+ "outputs": [],
96
+ "source": [
97
+ "# Reduce the batch size to reduce memory usage.\n",
98
+ "config = dataclasses.replace(config, batch_size=2)\n",
99
+ "\n",
100
+ "# Load a single batch of data. This is the same data that will be used during training.\n",
101
+ "# NOTE: In order to make this example self-contained, we are skipping the normalization step\n",
102
+ "# since it requires the normalization statistics to be generated using `compute_norm_stats`.\n",
103
+ "loader = _data_loader.create_data_loader(config, num_batches=1, skip_norm_stats=True)\n",
104
+ "obs, act = next(iter(loader))\n",
105
+ "\n",
106
+ "# Sample actions from the model.\n",
107
+ "loss = model.compute_loss(key, obs, act)\n",
108
+ "\n",
109
+ "# Delete the model to free up memory.\n",
110
+ "del model\n",
111
+ "\n",
112
+ "print(\"Loss shape:\", loss.shape)"
113
+ ]
114
+ }
115
+ ],
116
+ "metadata": {
117
+ "kernelspec": {
118
+ "display_name": ".venv",
119
+ "language": "python",
120
+ "name": "python3"
121
+ },
122
+ "language_info": {
123
+ "codemirror_mode": {
124
+ "name": "ipython",
125
+ "version": 3
126
+ },
127
+ "file_extension": ".py",
128
+ "mimetype": "text/x-python",
129
+ "name": "python",
130
+ "nbconvert_exporter": "python",
131
+ "pygments_lexer": "ipython3",
132
+ "version": "3.11.9"
133
+ }
134
+ },
135
+ "nbformat": 4,
136
+ "nbformat_minor": 2
137
+ }
capvector-pi05/examples/libero/compose.yml ADDED
@@ -0,0 +1,54 @@
1
+ # Run with:
2
+ # docker compose -f examples/libero/compose.yml up --build
3
+ services:
4
+ runtime:
5
+ image: libero
6
+ depends_on:
7
+ - openpi_server
8
+ build:
9
+ context: ../..
10
+ dockerfile: examples/libero/Dockerfile
11
+ init: true
12
+ tty: true
13
+ network_mode: host
14
+ privileged: true
15
+ volumes:
16
+ - $PWD:/app
17
+ - ../../data:/data
18
+ - /tmp/.X11-unix:/tmp/.X11-unix:ro
19
+ environment:
20
+ - CLIENT_ARGS
21
+ - DISPLAY=$DISPLAY
22
+ - MUJOCO_GL=${MUJOCO_GL:-egl}
23
+ deploy:
24
+ resources:
25
+ reservations:
26
+ devices:
27
+ - driver: nvidia
28
+ count: 1
29
+ capabilities: [gpu]
30
+
31
+ openpi_server:
32
+ image: openpi_server
33
+ build:
34
+ context: ../..
35
+ dockerfile: scripts/docker/serve_policy.Dockerfile
36
+ init: true
37
+ tty: true
38
+ network_mode: host
39
+ volumes:
40
+ - $PWD:/app
41
+ - ${OPENPI_DATA_HOME:-~/.cache/openpi}:/openpi_assets
42
+ environment:
43
+ - SERVER_ARGS
44
+ - OPENPI_DATA_HOME=/openpi_assets
45
+ - IS_DOCKER=true
46
+
47
+ # Comment out this block if not running on a machine with GPUs.
48
+ deploy:
49
+ resources:
50
+ reservations:
51
+ devices:
52
+ - driver: nvidia
53
+ count: 1
54
+ capabilities: [gpu]
capvector-pi05/examples/libero/convert_libero_data_to_lerobot.py ADDED
@@ -0,0 +1,104 @@
1
+ """
2
+ Minimal example script for converting a dataset to LeRobot format.
3
+
4
+ We use the Libero dataset (stored in RLDS) for this example, but it can be easily
5
+ modified for any other data you have saved in a custom format.
6
+
7
+ Usage:
8
+ uv run examples/libero/convert_libero_data_to_lerobot.py --data_dir /path/to/your/data
9
+
10
+ If you want to push your dataset to the Hugging Face Hub, you can use the following command:
11
+ uv run examples/libero/convert_libero_data_to_lerobot.py --data_dir /path/to/your/data --push_to_hub
12
+
13
+ Note: to run the script, you need to install tensorflow_datasets:
14
+ `uv pip install tensorflow tensorflow_datasets`
15
+
16
+ You can download the raw Libero datasets from https://huggingface.co/datasets/openvla/modified_libero_rlds
17
+ The resulting dataset will get saved to the $HF_LEROBOT_HOME directory.
18
+ Running this conversion script will take approximately 30 minutes.
19
+ """
20
+
21
+ import shutil
22
+
23
+ from lerobot.common.datasets.lerobot_dataset import HF_LEROBOT_HOME
24
+ from lerobot.common.datasets.lerobot_dataset import LeRobotDataset
25
+ import tensorflow_datasets as tfds
26
+ import tyro
27
+
28
+ REPO_NAME = "your_hf_username/libero" # Name of the output dataset, also used for the Hugging Face Hub
29
+ RAW_DATASET_NAMES = [
30
+ "libero_10_no_noops",
31
+ "libero_goal_no_noops",
32
+ "libero_object_no_noops",
33
+ "libero_spatial_no_noops",
34
+ ] # For simplicity we will combine multiple Libero datasets into one training dataset
35
+
36
+
37
+ def main(data_dir: str, *, push_to_hub: bool = False):
38
+ # Clean up any existing dataset in the output directory
39
+ output_path = HF_LEROBOT_HOME / REPO_NAME
40
+ if output_path.exists():
41
+ shutil.rmtree(output_path)
42
+
43
+ # Create LeRobot dataset, define features to store
44
+ # OpenPi assumes that proprio is stored in `state` and actions in `actions` (matching the feature keys below)
45
+ # LeRobot assumes that dtype of image data is `image`
46
+ dataset = LeRobotDataset.create(
47
+ repo_id=REPO_NAME,
48
+ robot_type="panda",
49
+ fps=10,
50
+ features={
51
+ "image": {
52
+ "dtype": "image",
53
+ "shape": (256, 256, 3),
54
+ "names": ["height", "width", "channel"],
55
+ },
56
+ "wrist_image": {
57
+ "dtype": "image",
58
+ "shape": (256, 256, 3),
59
+ "names": ["height", "width", "channel"],
60
+ },
61
+ "state": {
62
+ "dtype": "float32",
63
+ "shape": (8,),
64
+ "names": ["state"],
65
+ },
66
+ "actions": {
67
+ "dtype": "float32",
68
+ "shape": (7,),
69
+ "names": ["actions"],
70
+ },
71
+ },
72
+ image_writer_threads=10,
73
+ image_writer_processes=5,
74
+ )
75
+
76
+ # Loop over raw Libero datasets and write episodes to the LeRobot dataset
77
+ # You can modify this for your own data format
78
+ for raw_dataset_name in RAW_DATASET_NAMES:
79
+ raw_dataset = tfds.load(raw_dataset_name, data_dir=data_dir, split="train")
80
+ for episode in raw_dataset:
81
+ for step in episode["steps"].as_numpy_iterator():
82
+ dataset.add_frame(
83
+ {
84
+ "image": step["observation"]["image"],
85
+ "wrist_image": step["observation"]["wrist_image"],
86
+ "state": step["observation"]["state"],
87
+ "actions": step["action"],
88
+ "task": step["language_instruction"].decode(),
89
+ }
90
+ )
91
+ dataset.save_episode()
92
+
93
+ # Optionally push to the Hugging Face Hub
94
+ if push_to_hub:
95
+ dataset.push_to_hub(
96
+ tags=["libero", "panda", "rlds"],
97
+ private=False,
98
+ push_videos=True,
99
+ license="apache-2.0",
100
+ )
101
+
102
+
103
+ if __name__ == "__main__":
104
+ tyro.cli(main)
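
For reference, each `dataset.add_frame(...)` call above passes one timestep whose entries match the `features` spec declared at dataset creation. A synthetic sketch of such a frame is shown below; the values and the instruction string are dummy data for illustration, not real Libero samples.

```python
import numpy as np

frame = {
    "image": np.zeros((256, 256, 3), dtype=np.uint8),        # third-person camera, HWC
    "wrist_image": np.zeros((256, 256, 3), dtype=np.uint8),  # wrist camera, HWC
    "state": np.zeros(8, dtype=np.float32),                   # proprioceptive state
    "actions": np.zeros(7, dtype=np.float32),                  # 7-dim action
    "task": "put the bowl on the plate",                       # language instruction
}
```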
capvector-pi05/examples/policy_records.ipynb ADDED
@@ -0,0 +1,134 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import pathlib\n",
10
+ "\n",
11
+ "import numpy as np\n",
12
+ "\n",
13
+ "record_path = pathlib.Path(\"../policy_records\")\n",
14
+ "num_steps = len(list(record_path.glob(\"step_*.npy\")))\n",
15
+ "\n",
16
+ "records = []\n",
17
+ "for i in range(num_steps):\n",
18
+ " record = np.load(record_path / f\"step_{i}.npy\", allow_pickle=True).item()\n",
19
+ " records.append(record)"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": null,
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "print(\"length of records\", len(records))\n",
29
+ "print(\"keys in records\", records[0].keys())\n",
30
+ "\n",
31
+ "for k in records[0]:\n",
32
+ " print(f\"{k} shape: {records[0][k].shape}\")"
33
+ ]
34
+ },
35
+ {
36
+ "cell_type": "code",
37
+ "execution_count": null,
38
+ "metadata": {},
39
+ "outputs": [],
40
+ "source": [
41
+ "from PIL import Image\n",
42
+ "\n",
43
+ "\n",
44
+ "def get_image(step: int, idx: int = 0):\n",
45
+ " img = (255 * records[step][\"inputs/image\"]).astype(np.uint8)\n",
46
+ " return img[idx].transpose(1, 2, 0)\n",
47
+ "\n",
48
+ "\n",
49
+ "def show_image(step: int, idx_lst: list[int]):\n",
50
+ " imgs = [get_image(step, idx) for idx in idx_lst]\n",
51
+ " return Image.fromarray(np.hstack(imgs))\n",
52
+ "\n",
53
+ "\n",
54
+ "for i in range(2):\n",
55
+ " display(show_image(i, [0]))"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 14,
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "import pandas as pd\n",
65
+ "\n",
66
+ "\n",
67
+ "def get_axis(name, axis):\n",
68
+ " return np.array([record[name][axis] for record in records])\n",
69
+ "\n",
70
+ "\n",
71
+ "# qpos is [..., 14] of type float:\n",
72
+ "# 0-5: left arm joint angles\n",
73
+ "# 6: left arm gripper\n",
74
+ "# 7-12: right arm joint angles\n",
75
+ "# 13: right arm gripper\n",
76
+ "names = [(\"left_joint\", 6), (\"left_gripper\", 1), (\"right_joint\", 6), (\"right_gripper\", 1)]\n",
77
+ "\n",
78
+ "\n",
79
+ "def make_data():\n",
80
+ " cur_dim = 0\n",
81
+ " in_data = {}\n",
82
+ " out_data = {}\n",
83
+ " for name, dim_size in names:\n",
84
+ " for i in range(dim_size):\n",
85
+ " in_data[f\"{name}_{i}\"] = get_axis(\"inputs/qpos\", cur_dim)\n",
86
+ " out_data[f\"{name}_{i}\"] = get_axis(\"outputs/qpos\", cur_dim)\n",
87
+ " cur_dim += 1\n",
88
+ " return pd.DataFrame(in_data), pd.DataFrame(out_data)\n",
89
+ "\n",
90
+ "\n",
91
+ "in_data, out_data = make_data()"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": null,
97
+ "metadata": {},
98
+ "outputs": [],
99
+ "source": [
100
+ "for name in in_data.columns:\n",
101
+ " data = pd.DataFrame({f\"in_{name}\": in_data[name], f\"out_{name}\": out_data[name]})\n",
102
+ " data.plot()"
103
+ ]
104
+ },
105
+ {
106
+ "cell_type": "code",
107
+ "execution_count": null,
108
+ "metadata": {},
109
+ "outputs": [],
110
+ "source": []
111
+ }
112
+ ],
113
+ "metadata": {
114
+ "kernelspec": {
115
+ "display_name": ".venv",
116
+ "language": "python",
117
+ "name": "python3"
118
+ },
119
+ "language_info": {
120
+ "codemirror_mode": {
121
+ "name": "ipython",
122
+ "version": 3
123
+ },
124
+ "file_extension": ".py",
125
+ "mimetype": "text/x-python",
126
+ "name": "python",
127
+ "nbconvert_exporter": "python",
128
+ "pygments_lexer": "ipython3",
129
+ "version": "3.11.9"
130
+ }
131
+ },
132
+ "nbformat": 4,
133
+ "nbformat_minor": 2
134
+ }
capvector-pi05/pyproject.toml ADDED
@@ -0,0 +1,142 @@
1
+ [project]
2
+ name = "openpi"
3
+ version = "0.1.0"
4
+ description = "Physical Intelligence open source repo"
5
+ readme = "README.md"
6
+ requires-python = ">=3.11"
7
+ license = { file = "LICENSE" }
8
+ dependencies = [
9
+ "augmax>=0.3.4",
10
+ "dm-tree>=0.1.8",
11
+ "einops>=0.8.0",
12
+ "equinox>=0.11.8",
13
+ "flatbuffers>=24.3.25",
14
+ "flax==0.10.2",
15
+ "fsspec[gcs]>=2024.6.0",
16
+ "gym-aloha>=0.1.1",
17
+ "imageio>=2.36.1",
18
+ "jax[cuda12]==0.5.3",
19
+ "jaxtyping==0.2.36",
20
+ "lerobot",
21
+ "ml_collections==1.0.0",
22
+ "numpy>=1.22.4,<2.0.0",
23
+ "numpydantic>=1.6.6",
24
+ "opencv-python>=4.10.0.84",
25
+ "openpi-client",
26
+ "orbax-checkpoint==0.11.13",
27
+ "pillow>=11.0.0",
28
+ "sentencepiece>=0.2.0",
29
+ "torch==2.7.1",
30
+ "tqdm-loggable>=0.2",
31
+ "typing-extensions>=4.12.2",
32
+ "tyro>=0.9.5",
33
+ "wandb>=0.19.1",
34
+ "filelock>=3.16.1",
35
+ "beartype==0.19.0",
36
+ "treescope>=0.1.7",
37
+ "transformers==4.53.2",
38
+ "rich>=14.0.0",
39
+ "polars>=1.30.0",
40
+ "gradio==5.17.1",
41
+ "viser==0.2.23",
42
+ "hydra-core",
43
+ "onnxruntime",
44
+ "safetensors",
45
+ ]
46
+
47
+
48
+ [project.urls]
49
+ Repository = "https://github.com/Physical-Intelligence/openpi"
50
+
51
+ [dependency-groups]
52
+ dev = [
53
+ "pytest>=8.3.4",
54
+ "ruff>=0.8.6",
55
+ "pre-commit>=4.0.1",
56
+ "ipykernel>=6.29.5",
57
+ "ipywidgets>=8.1.5",
58
+ "matplotlib>=3.10.0",
59
+ "pynvml>=12.0.0",
60
+ ]
61
+ rlds = [
62
+ "dlimp",
63
+ "tensorflow-cpu==2.15.0",
64
+ "tensorflow-datasets==4.9.9",
65
+ ]
66
+
67
+ [tool.uv]
68
+ override-dependencies = ["datasets==3.6.0", "ml-dtypes==0.4.1", "tensorstore==0.1.74"]
69
+
70
+ [tool.uv.sources]
71
+ openpi-client = { workspace = true }
72
+ lerobot = { git = "https://github.com/huggingface/lerobot", rev = "0cf864870cf29f4738d3ade893e6fd13fbd7cdb5" }
73
+ dlimp = { git = "https://github.com/kvablack/dlimp", rev = "ad72ce3a9b414db2185bc0b38461d4101a65477a" }
74
+
75
+ [tool.uv.workspace]
76
+ members = ["packages/*", "src/vggt"]
77
+
78
+ [tool.ruff]
79
+ line-length = 120
80
+ target-version = "py311"
81
+ extend-exclude = ["docker", "third_party", "src/openpi/models_pytorch/transformers_replace/*"]
82
+
83
+ [tool.ruff.lint]
84
+ # https://docs.astral.sh/ruff/rules/
85
+ select = [
86
+ "B",
87
+ "C4",
88
+ "DTZ",
89
+ "E4",
90
+ "E7",
91
+ "E9",
92
+ "F",
93
+ "FBT",
94
+ "FURB",
95
+ "I",
96
+ "ICN",
97
+ "ISC",
98
+ "LOG",
99
+ "N",
100
+ "PD",
101
+ "PERF",
102
+ "PIE",
103
+ "PLC",
104
+ "PLE",
105
+ "PLR1",
106
+ "PLR5",
107
+ "PLW",
108
+ "PT",
109
+ "Q",
110
+ "RET",
111
+ "RUF",
112
+ "SIM",
113
+ "SLF",
114
+ "T10",
115
+ "T20",
116
+ "UP",
117
+ "W",
118
+ ]
119
+ ignore = [
120
+ "F722", # Conflicts with array typing.
121
+ "T201", # We use print statements.
122
+ "PD008", # Lots of false positives.
123
+ "ISC001", # Disabling to support ruff format.
124
+ "LOG015", # Use logger.info.
125
+ ]
126
+ unfixable = [
127
+ "B905", # Fix defaults to strict=False, which is not what we want.
128
+ ]
129
+
130
+ [tool.ruff.lint.isort]
131
+ force-single-line = true
132
+ force-sort-within-sections = true
133
+ single-line-exclusions = ["collections.abc", "typing", "typing_extensions"]
134
+ known-third-party = ["wandb"]
135
+
136
+ [build-system]
137
+ requires = ["hatchling"]
138
+ build-backend = "hatchling.build"
139
+
140
+ [tool.pytest.ini_options]
141
+ markers = ["manual: should be run manually."]
142
+ testpaths = ["src", "scripts", "packages"]