iMihayo committed on
Commit e188f3d (verified)
1 Parent(s): dfa767d

Add files using upload-large-folder tool

Files changed (41)
  1. prismatic/__init__.py +1 -0
  2. prismatic/conf/datasets.py +133 -0
  3. prismatic/conf/vla.py +235 -0
  4. prismatic/models/backbones/__init__.py +0 -0
  5. prismatic/models/backbones/vision/dinov2_vit.py +19 -0
  6. prismatic/models/load.py +226 -0
  7. prismatic/models/materialize.py +130 -0
  8. prismatic/models/projectors.py +67 -0
  9. prismatic/preprocessing/datasets/datasets.py +200 -0
  10. prismatic/preprocessing/materialize.py +69 -0
  11. prismatic/py.typed +0 -0
  12. prismatic/util/nn_utils.py +53 -0
  13. prismatic/vla/datasets/rlds/__init__.py +1 -0
  14. prismatic/vla/datasets/rlds/dataset.py +655 -0
  15. prismatic/vla/datasets/rlds/oxe/transforms.py +951 -0
  16. prismatic/vla/datasets/rlds/oxe/utils/droid_utils.py +178 -0
  17. prismatic/vla/datasets/rlds/utils/__init__.py +0 -0
  18. prismatic/vla/datasets/rlds/utils/goal_relabeling.py +32 -0
  19. prismatic/vla/materialize.py +56 -0
  20. run_scripts/ac/ac.sh +87 -0
  21. run_scripts/ffn/3ffn2.sh +87 -0
  22. run_scripts/ffn/3postffn2.sh +87 -0
  23. run_scripts/ffn/3postffn6.sh +87 -0
  24. run_scripts/ffn/debug_5ffn_withactionprojector.sh +87 -0
  25. run_scripts/ffn/ffn4.sh +87 -0
  26. run_scripts/ffn/ffn8.sh +87 -0
  27. run_scripts/ffn/test.sh +87 -0
  28. run_scripts/ffn_long_chunks/run.sh +4 -0
  29. run_scripts/ffn_q2a/aloha/test_aloha_robotwin2_ffn_25_base.sh +88 -0
  30. run_scripts/ffn_q2a/aloha/test_aloha_robotwin2_ffn_50_l2.sh +102 -0
  31. run_scripts/ffn_q2a/bridge/exffn_relu_connector_linear_relu.sh +95 -0
  32. run_scripts/ffn_q2a/bridge/run_bridge.sh +2 -0
  33. run_scripts/ffn_q2a/franka/exffn_gelu_franka.sh +95 -0
  34. run_scripts/ffn_q2a/libero_moe/debug_moe_lit.sh +101 -0
  35. run_scripts/ffn_q2a/simhead/simhead_contrastive.sh +100 -0
  36. run_scripts/pp/pp.sh +87 -0
  37. run_scripts/run.sh +35 -0
  38. scripts/extern/verify_prismatic.py +134 -0
  39. scripts/pretrain.py +238 -0
  40. test_deepseek_moe.py +246 -0
  41. vla-scripts/extern/convert_openvla_weights_to_hf.py +272 -0
prismatic/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .models import available_model_names, available_models, get_model_description, load
prismatic/conf/datasets.py ADDED
@@ -0,0 +1,133 @@
1
+ """
2
+ datasets.py
3
+
4
+ Draccus Dataclass Definition for a DatasetConfig object, with various registered subclasses for each dataset variant
5
+ and processing scheme. A given dataset variant (e.g., `llava-lightning`) configures the following attributes:
6
+ - Dataset Variant (Identifier) --> e.g., "llava-v15"
7
+ - Align Stage Dataset Components (annotations, images)
8
+ - Finetune Stage Dataset Components (annotations, images)
9
+ - Dataset Root Directory (Path)
10
+ """
11
+
12
+ from dataclasses import dataclass
13
+ from enum import Enum, unique
14
+ from pathlib import Path
15
+ from typing import Tuple
16
+
17
+ from draccus import ChoiceRegistry
18
+
19
+
20
+ @dataclass
21
+ class DatasetConfig(ChoiceRegistry):
22
+ # fmt: off
23
+ dataset_id: str # Unique ID that fully specifies a dataset variant
24
+
25
+ # Dataset Components for each Stage in < align | finetune >
26
+ align_stage_components: Tuple[Path, Path] # Path to annotation file and images directory for `align` stage
27
+ finetune_stage_components: Tuple[Path, Path] # Path to annotation file and images directory for `finetune` stage
28
+
29
+ dataset_root_dir: Path # Path to dataset root directory; other paths are relative to root
30
+ # fmt: on
31
+
32
+
33
+ # [Reproduction] LLaVa-v15 (exact dataset used in all public LLaVa-v15 models)
34
+ @dataclass
35
+ class LLaVa_V15_Config(DatasetConfig):
36
+ dataset_id: str = "llava-v15"
37
+
38
+ align_stage_components: Tuple[Path, Path] = (
39
+ Path("download/llava-laion-cc-sbu-558k/chat.json"),
40
+ Path("download/llava-laion-cc-sbu-558k/"),
41
+ )
42
+ finetune_stage_components: Tuple[Path, Path] = (
43
+ Path("download/llava-v1.5-instruct/llava_v1_5_mix665k.json"),
44
+ Path("download/llava-v1.5-instruct/"),
45
+ )
46
+ dataset_root_dir: Path = Path("/mnt/fsx/skaramcheti/datasets/prismatic-vlms")
47
+
48
+
49
+ # [Multimodal-Only] LLava-v15 WITHOUT the Language-Only ShareGPT Data (No Co-Training)
50
+ @dataclass
51
+ class LLaVa_Multimodal_Only_Config(DatasetConfig):
52
+ dataset_id: str = "llava-multimodal"
53
+
54
+ align_stage_components: Tuple[Path, Path] = (
55
+ Path("download/llava-laion-cc-sbu-558k/chat.json"),
56
+ Path("download/llava-laion-cc-sbu-558k/"),
57
+ )
58
+ finetune_stage_components: Tuple[Path, Path] = (
59
+ Path("download/llava-v1.5-instruct/llava_v1_5_stripped625k.json"),
60
+ Path("download/llava-v1.5-instruct/"),
61
+ )
62
+ dataset_root_dir: Path = Path("/mnt/fsx/skaramcheti/datasets/prismatic-vlms")
63
+
64
+
65
+ # LLaVa-v15 + LVIS-Instruct-4V
66
+ @dataclass
67
+ class LLaVa_LVIS4V_Config(DatasetConfig):
68
+ dataset_id: str = "llava-lvis4v"
69
+
70
+ align_stage_components: Tuple[Path, Path] = (
71
+ Path("download/llava-laion-cc-sbu-558k/chat.json"),
72
+ Path("download/llava-laion-cc-sbu-558k/"),
73
+ )
74
+ finetune_stage_components: Tuple[Path, Path] = (
75
+ Path("download/llava-v1.5-instruct/llava_v1_5_lvis4v_mix888k.json"),
76
+ Path("download/llava-v1.5-instruct/"),
77
+ )
78
+ dataset_root_dir: Path = Path("/mnt/fsx/skaramcheti/datasets/prismatic-vlms")
79
+
80
+
81
+ # LLaVa-v15 + LRV-Instruct
82
+ @dataclass
83
+ class LLaVa_LRV_Config(DatasetConfig):
84
+ dataset_id: str = "llava-lrv"
85
+
86
+ align_stage_components: Tuple[Path, Path] = (
87
+ Path("download/llava-laion-cc-sbu-558k/chat.json"),
88
+ Path("download/llava-laion-cc-sbu-558k/"),
89
+ )
90
+ finetune_stage_components: Tuple[Path, Path] = (
91
+ Path("download/llava-v1.5-instruct/llava_v1_5_lrv_mix1008k.json"),
92
+ Path("download/llava-v1.5-instruct/"),
93
+ )
94
+ dataset_root_dir: Path = Path("/mnt/fsx/skaramcheti/datasets/prismatic-vlms")
95
+
96
+
97
+ # LLaVa-v15 + LVIS-Instruct-4V + LRV-Instruct
98
+ @dataclass
99
+ class LLaVa_LVIS4V_LRV_Config(DatasetConfig):
100
+ dataset_id: str = "llava-lvis4v-lrv"
101
+
102
+ align_stage_components: Tuple[Path, Path] = (
103
+ Path("download/llava-laion-cc-sbu-558k/chat.json"),
104
+ Path("download/llava-laion-cc-sbu-558k/"),
105
+ )
106
+ finetune_stage_components: Tuple[Path, Path] = (
107
+ Path("download/llava-v1.5-instruct/llava_v1_5_lvis4v_lrv_mix1231k.json"),
108
+ Path("download/llava-v1.5-instruct/"),
109
+ )
110
+ dataset_root_dir: Path = Path("/mnt/fsx/skaramcheti/datasets/prismatic-vlms")
111
+
112
+
113
+ # === Define a Dataset Registry Enum for Reference & Validation =>> all *new* datasets must be added here! ===
114
+ @unique
115
+ class DatasetRegistry(Enum):
116
+ # === LLaVa v1.5 ===
117
+ LLAVA_V15 = LLaVa_V15_Config
118
+
119
+ LLAVA_MULTIMODAL_ONLY = LLaVa_Multimodal_Only_Config
120
+
121
+ LLAVA_LVIS4V = LLaVa_LVIS4V_Config
122
+ LLAVA_LRV = LLaVa_LRV_Config
123
+
124
+ LLAVA_LVIS4V_LRV = LLaVa_LVIS4V_LRV_Config
125
+
126
+ @property
127
+ def dataset_id(self) -> str:
128
+ return self.value.dataset_id
129
+
130
+
131
+ # Register Datasets in Choice Registry
132
+ for dataset_variant in DatasetRegistry:
133
+ DatasetConfig.register_subclass(dataset_variant.dataset_id, dataset_variant.value)
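For orientation, a minimal usage sketch (illustrative, not part of this commit) of how a registered dataset variant is resolved back into its config class via draccus's ChoiceRegistry interface, mirroring the `get_choice_class` call used in `prismatic/models/load.py` below:

from prismatic.conf.datasets import DatasetConfig, DatasetRegistry

# Resolve the registered subclass for the `llava-v15` variant and instantiate it with default paths.
cfg_cls = DatasetConfig.get_choice_class(DatasetRegistry.LLAVA_V15.dataset_id)
cfg = cfg_cls()
print(cfg.dataset_id, cfg.finetune_stage_components[0])  # llava-v15 download/llava-v1.5-instruct/llava_v1_5_mix665k.json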
prismatic/conf/vla.py ADDED
@@ -0,0 +1,235 @@
1
+ """
2
+ vla.py
3
+
4
+ Draccus Dataclass Definition for a VLAConfig object, with various registered subclasses for each VLA experiment and
5
+ model configuration thereof. A given VLA model (`policy`) configures the following attributes:
6
+ - Data Mixture (e.g., Bridge, OXE_MAGIC_SOUP, etc.)
7
+ - Base VLM from Prismatic Registry (e.g., `prism-dinosiglip+7b`)
8
+ - VLA Model Architecture / Parameters (e.g., freeze vision encoder, last layer finetuning)
9
+ - Training / Optimization Hyperparameters
10
+ """
11
+
12
+ from dataclasses import dataclass
13
+ from enum import Enum, unique
14
+ from pathlib import Path
15
+ from typing import Optional, Union
16
+
17
+ from draccus import ChoiceRegistry
18
+
19
+
20
+ @dataclass
21
+ class VLAConfig(ChoiceRegistry):
22
+ # fmt: off
23
+ vla_id: str # Unique VLA Policy ID that fully specifies a configuration variant
24
+ base_vlm: Union[str, Path] # Base VLM as ID/Path to Run Directory (e.g., `prism-dinosiglip+7b`)
25
+ freeze_vision_backbone: bool # Freeze Vision Backbone Parameters (akin to pretraining)
26
+ freeze_llm_backbone: bool # Freeze LLM Backbone parameters
27
+ unfreeze_last_llm_layer: bool # Unfreeze final layer of LLM (only takes effect if LLM is frozen)
28
+
29
+ # Data Mixture Parameters
30
+ data_mix: str # Open-X Embodiment Dataset =>> Unique Mixture ID (e.g., `bridge`)
31
+ shuffle_buffer_size: int # Size of Shuffle Buffer (100K for Bridge, 1M for OXE)
32
+
33
+ # Optimization Parameters
34
+ epochs: int # Epochs to Run (in case `max_steps` is not specified)
35
+ max_steps: Optional[int] # [Optional] Max Gradient Steps to Run (overrides `epochs`)
36
+
37
+ expected_world_size: int # Expected # of GPUs =>> allows us to gate training on hardware
38
+ global_batch_size: int # Global Batch Size (divided across processes / world size)
39
+ per_device_batch_size: int # Per-Device Batch Size (per-process / individual GPU)
40
+ # =>> # of accumulation steps is auto-computed
41
+
42
+ learning_rate: float # Peak Learning Rate (`lr_scheduler_type` sets warmup/decay)
43
+ weight_decay: float # Weight Decay for AdamW Optimizer
44
+ max_grad_norm: float # Max Grad Norm (for global gradient clipping)
45
+ lr_scheduler_type: str # LR Scheduler (usually: "constant" | "linear-warmup+cosine-decay")
46
+ warmup_ratio: float # Fraction of Steps to Warmup (for warmup LR schedulers)
47
+
48
+ train_strategy: str # Train Strategy (default "fsdp-full-shard")
49
+
50
+ # Enable Gradient/Activation Checkpointing (for the LLM Backbone)
51
+ enable_gradient_checkpointing: bool = True # Enable Gradient/Activation Checkpointing during Training
52
+
53
+ # Mixed Precision Training via Torch Native AMP (`autocast`)
54
+ enable_mixed_precision_training: bool = True # Enable Traditional BF16 Mixed Precision
55
+ reduce_in_full_precision: bool = True # Accumulate/Reduce All-Gather Gradients in FP32 Full Precision
56
+
57
+ # fmt: on
58
+
59
+
60
+ # === OpenVLA Training Configurations ===
61
+
62
+
63
+ # = [8 GPU] Fast Iteration =>> SigLIP 224px + Bridge =
64
+ @dataclass
65
+ class Exp_SigLIP_224px_Bridge(VLAConfig):
66
+ vla_id: str = "siglip-224px+mx-bridge"
67
+ base_vlm: Union[str, Path] = "siglip-224px+7b"
68
+
69
+ freeze_vision_backbone: bool = False
70
+ freeze_llm_backbone: bool = False
71
+ unfreeze_last_llm_layer: bool = False
72
+
73
+ # Data Mixture Parameters
74
+ data_mix: str = "bridge"
75
+ shuffle_buffer_size: int = 256_000
76
+
77
+ # Optimization Parameters
78
+ epochs: int = 1000
79
+ max_steps: Optional[int] = None
80
+
81
+ expected_world_size: int = 8
82
+ global_batch_size: int = 256
83
+ per_device_batch_size: int = 32
84
+
85
+ learning_rate: float = 2e-5
86
+ weight_decay: float = 0.0
87
+ max_grad_norm: float = 1.0
88
+ lr_scheduler_type: str = "constant"
89
+ warmup_ratio: float = 0.0
90
+
91
+ train_strategy: str = "fsdp-full-shard"
92
+
93
+
94
+ # = [8 GPU] SigLIP 224px Frozen Vision Backbone + Bridge =
95
+ @dataclass
96
+ class Exp_FreezeVIT_SigLIP_224px_Bridge(Exp_SigLIP_224px_Bridge):
97
+ vla_id: str = "siglip-224px-icy+mx-bridge"
98
+ base_vlm: Union[str, Path] = "siglip-224px+7b"
99
+ freeze_vision_backbone: bool = True
100
+
101
+
102
+ # = [8 GPU] Fast Iteration =>> DINO-SigLIP 224px + Bridge =
103
+ @dataclass
104
+ class Exp_DinoSigLIP_224px_Bridge(Exp_SigLIP_224px_Bridge):
105
+ vla_id: str = "prism-dinosiglip-224px+mx-bridge"
106
+ base_vlm: Union[str, Path] = "prism-dinosiglip-224px+7b"
107
+
108
+ data_mix: str = "bridge"
109
+
110
+
111
+ # = [64 GPU] SigLIP 224px + OXE Magic Soup =
112
+ @dataclass
113
+ class Exp_SigLIP_224px_OXE_Magic_Soup(Exp_SigLIP_224px_Bridge):
114
+ vla_id: str = "siglip-224px+mx-oxe-magic-soup"
115
+ base_vlm: Union[str, Path] = "siglip-224px+7b"
116
+
117
+ data_mix: str = "oxe_magic_soup"
118
+
119
+ expected_world_size: int = 64
120
+ global_batch_size: int = 2048
121
+ per_device_batch_size: int = 32
122
+
123
+
124
+ # = [64 GPU] DINO-SigLIP 224px + OXE Magic Soup++ =
125
+ @dataclass
126
+ class Exp_DinoSigLIP_224px_OXE_Magic_Soup_Plus(Exp_SigLIP_224px_Bridge):
127
+ vla_id: str = "prism-dinosiglip-224px+mx-oxe-magic-soup-plus"
128
+ base_vlm: Union[str, Path] = "prism-dinosiglip-224px+7b"
129
+
130
+ # Note =>> We adopt two stages, training on a mixture including DROID for 70% of training, before resampling!
131
+ # data_mix: str = "oxe_magic_soup_plus"
132
+ data_mix: str = "oxe_magic_soup_plus_minus"
133
+
134
+ expected_world_size: int = 64
135
+ global_batch_size: int = 2048
136
+ per_device_batch_size: int = 32
137
+
138
+
139
+ # === OpenVLA Fine-tuning Configurations ===
140
+
141
+
142
+ # = [8 GPU] SigLIP 224px + T-DROID =
143
+ @dataclass
144
+ class Exp_SigLIP_224px_TDROID_CarrotInBowl(Exp_SigLIP_224px_Bridge):
145
+ vla_id: str = "siglip-224px+mx-tdroid_carrot_in_bowl"
146
+ base_vlm: Union[str, Path] = "siglip-224px+7b"
147
+
148
+ data_mix: str = "tdroid_carrot_in_bowl"
149
+
150
+
151
+ @dataclass
152
+ class Exp_SigLIP_224px_TDROID_PourCornInPot(Exp_SigLIP_224px_Bridge):
153
+ vla_id: str = "siglip-224px+mx-tdroid_pour_corn_in_pot"
154
+ base_vlm: Union[str, Path] = "siglip-224px+7b"
155
+
156
+ data_mix: str = "tdroid_pour_corn_in_pot"
157
+
158
+
159
+ # = [8 GPU] SigLIP 224px + T-DROID -- Partial Finetuning =
160
+ @dataclass
161
+ class Exp_SigLIP_224px_Icy_TDROID_CarrotInBowl(Exp_SigLIP_224px_Bridge):
162
+ vla_id: str = "siglip-224px-icy+mx-tdroid_carrot_in_bowl"
163
+ base_vlm: Union[str, Path] = "siglip-224px+7b"
164
+ freeze_vision_backbone: bool = True
165
+ freeze_llm_backbone: bool = False
166
+
167
+ data_mix: str = "tdroid_carrot_in_bowl"
168
+
169
+
170
+ @dataclass
171
+ class Exp_SigLIP_224px_LastLayer_TDROID_CarrotInBowl(Exp_SigLIP_224px_Bridge):
172
+ vla_id: str = "siglip-224px-last_layer+mx-tdroid_carrot_in_bowl"
173
+ base_vlm: Union[str, Path] = "siglip-224px+7b"
174
+ freeze_vision_backbone: bool = True
175
+ freeze_llm_backbone: bool = True
176
+ unfreeze_last_llm_layer: bool = True
177
+
178
+ data_mix: str = "tdroid_carrot_in_bowl"
179
+
180
+
181
+ @dataclass
182
+ class Exp_SigLIP_224px_Sandwich_TDROID_CarrotInBowl(Exp_SigLIP_224px_Bridge):
183
+ vla_id: str = "siglip-224px-sandwich+mx-tdroid_carrot_in_bowl"
184
+ base_vlm: Union[str, Path] = "siglip-224px+7b"
185
+ freeze_vision_backbone: bool = False
186
+ freeze_llm_backbone: bool = True
187
+ unfreeze_last_llm_layer: bool = True
188
+
189
+ data_mix: str = "tdroid_carrot_in_bowl"
190
+
191
+
192
+ # === [8 GPU] SigLIP 224px + FrankaWipe ===
193
+ @dataclass
194
+ class Exp_SigLIP_224px_Droid_Wipe(Exp_SigLIP_224px_Bridge):
195
+ vla_id: str = "siglip-224px+mx-droid_wipe"
196
+ base_vlm: Union[str, Path] = "siglip-224px+7b"
197
+
198
+ data_mix: str = "droid_wipe"
199
+
200
+
201
+ # === Define a VLA Registry Enum for Reference & Validation ===
202
+ @unique
203
+ class VLARegistry(Enum):
204
+ # Sanity Check Configurations =>> BridgeV2
205
+ SIGLIP_224PX_MX_BRIDGE = Exp_SigLIP_224px_Bridge
206
+ DINOSIGLIP_224PX_MX_BRIDGE = Exp_DinoSigLIP_224px_Bridge
207
+
208
+ # SigLIP Frozen Backbone Experiment
209
+ FREEZE_SIGLIP_224PX_MX_BRIDGE = Exp_FreezeVIT_SigLIP_224px_Bridge
210
+
211
+ # [OpenVLA v0.1 7B] SigLIP 224px + OXE Magic Soup
212
+ SIGLIP_224PX_MX_OXE_MAGIC_SOUP = Exp_SigLIP_224px_OXE_Magic_Soup
213
+
214
+ # [OpenVLA 7B] DINO + SigLIP 224px + OXE Magic Soup++
215
+ DINOSIGLIP_224PX_MX_OXE_MAGIC_SOUP_PLUS = Exp_DinoSigLIP_224px_OXE_Magic_Soup_Plus
216
+
217
+ # === TDROID Fine-tuning Configs ===
218
+ SIGLIP_224PX_MX_TDROID_CARROT_IN_BOWL = Exp_SigLIP_224px_TDROID_CarrotInBowl
219
+ SIGLIP_224PX_MX_TDROID_POUR_CORN_IN_POT = Exp_SigLIP_224px_TDROID_PourCornInPot
220
+
221
+ SIGLIP_224PX_ICY_MX_TDROID_CARROT_IN_BOWL = Exp_SigLIP_224px_Icy_TDROID_CarrotInBowl
222
+ SIGLIP_224PX_LASTLAYER_MX_TDROID_CARROT_IN_BOWL = Exp_SigLIP_224px_LastLayer_TDROID_CarrotInBowl
223
+ SIGLIP_224PX_SANDWICH_MX_TDROID_CARROT_IN_BOWL = Exp_SigLIP_224px_Sandwich_TDROID_CarrotInBowl
224
+
225
+ # === DROID Fine-tuning Configs ===
226
+ SIGLIP_224PX_MX_DROID_WIPE = Exp_SigLIP_224px_Droid_Wipe
227
+
228
+ @property
229
+ def vla_id(self) -> str:
230
+ return self.value.vla_id
231
+
232
+
233
+ # Register VLAs in Choice Registry
234
+ for vla_variant in VLARegistry:
235
+ VLAConfig.register_subclass(vla_variant.vla_id, vla_variant.value)
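Analogously, a minimal sketch (illustrative, not part of this commit) of looking up a registered VLA experiment by its `vla_id`; the "# of accumulation steps is auto-computed" comment above implies the derivation shown here:

from prismatic.conf.vla import VLAConfig, VLARegistry

vla_cfg = VLAConfig.get_choice_class(VLARegistry.SIGLIP_224PX_MX_BRIDGE.vla_id)()
grad_accum_steps = vla_cfg.global_batch_size // (vla_cfg.per_device_batch_size * vla_cfg.expected_world_size)
print(vla_cfg.vla_id, grad_accum_steps)  # siglip-224px+mx-bridge 1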
prismatic/models/backbones/__init__.py ADDED
File without changes
prismatic/models/backbones/vision/dinov2_vit.py ADDED
@@ -0,0 +1,19 @@
1
+ """
2
+ dinov2_vit.py
3
+ """
4
+
5
+ from prismatic.models.backbones.vision.base_vision import TimmViTBackbone
6
+
7
+ # Registry =>> Supported DINOv2 Vision Backbones (from TIMM) =>> Note:: Using DINOv2 w/ Registers!
8
+ # => Reference: https://arxiv.org/abs/2309.16588
9
+ DINOv2_VISION_BACKBONES = {"dinov2-vit-l": "vit_large_patch14_reg4_dinov2.lvd142m"}
10
+
11
+
12
+ class DinoV2ViTBackbone(TimmViTBackbone):
13
+ def __init__(self, vision_backbone_id: str, image_resize_strategy: str, default_image_size: int = 224) -> None:
14
+ super().__init__(
15
+ vision_backbone_id,
16
+ DINOv2_VISION_BACKBONES[vision_backbone_id],
17
+ image_resize_strategy,
18
+ default_image_size=default_image_size,
19
+ )
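A minimal instantiation sketch (illustrative; assumes the TIMM weights are reachable and that "resize-naive" is one of the supported resize strategies):

from prismatic.models.backbones.vision import DinoV2ViTBackbone

# Build the DINOv2 ViT-L/14 (with registers) backbone and grab its default image transform.
backbone = DinoV2ViTBackbone("dinov2-vit-l", image_resize_strategy="resize-naive", default_image_size=224)
image_transform = backbone.get_image_transform()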
prismatic/models/load.py ADDED
@@ -0,0 +1,226 @@
1
+ """
2
+ load.py
3
+
4
+ Entry point for loading pretrained VLMs for inference; exposes functions for listing available models (with canonical
5
+ IDs, mappings to paper experiments, and short descriptions), as well as for loading models (from disk or HF Hub).
6
+ """
7
+
8
+ import json
9
+ import os
10
+ from pathlib import Path
11
+ from typing import List, Optional, Union
12
+
13
+ from huggingface_hub import HfFileSystem, hf_hub_download
14
+
15
+ from prismatic.conf import ModelConfig
16
+ from prismatic.models.materialize import get_llm_backbone_and_tokenizer, get_vision_backbone_and_transform
17
+ from prismatic.models.registry import GLOBAL_REGISTRY, MODEL_REGISTRY
18
+ from prismatic.models.vlas import OpenVLA
19
+ from prismatic.models.vlms import PrismaticVLM
20
+ from prismatic.overwatch import initialize_overwatch
21
+ from prismatic.vla.action_tokenizer import ActionTokenizer
22
+
23
+ # Initialize Overwatch =>> Wraps `logging.Logger`
24
+ overwatch = initialize_overwatch(__name__)
25
+
26
+
27
+ # === HF Hub Repository ===
28
+ HF_HUB_REPO = "TRI-ML/prismatic-vlms"
29
+ VLA_HF_HUB_REPO = "openvla/openvla-dev"
30
+
31
+
32
+ # === Available Models ===
33
+ def available_models() -> List[str]:
34
+ return list(MODEL_REGISTRY.keys())
35
+
36
+
37
+ def available_model_names() -> List[str]:
38
+ return list(GLOBAL_REGISTRY.keys())
39
+
40
+
41
+ def get_model_description(model_id_or_name: str) -> str:
42
+ if model_id_or_name not in GLOBAL_REGISTRY:
43
+ raise ValueError(f"Couldn't find `{model_id_or_name = }`; check `prismatic.available_model_names()`")
44
+
45
+ # Print Description & Return
46
+ print(json.dumps(description := GLOBAL_REGISTRY[model_id_or_name]["description"], indent=2))
47
+
48
+ return description
49
+
50
+
51
+ # === Load Pretrained Model ===
52
+ def load(
53
+ model_id_or_path: Union[str, Path],
54
+ hf_token: Optional[str] = None,
55
+ cache_dir: Optional[Union[str, Path]] = None,
56
+ load_for_training: bool = False,
57
+ ) -> PrismaticVLM:
58
+ """Loads a pretrained PrismaticVLM from either local disk or the HuggingFace Hub."""
59
+ if os.path.isdir(model_id_or_path):
60
+ overwatch.info(f"Loading from local path `{(run_dir := Path(model_id_or_path))}`")
61
+
62
+ # Get paths for `config.json` and pretrained checkpoint
63
+ config_json, checkpoint_pt = run_dir / "config.json", run_dir / "checkpoints" / "latest-checkpoint.pt"
64
+ assert config_json.exists(), f"Missing `config.json` for `{run_dir = }`"
65
+ assert checkpoint_pt.exists(), f"Missing checkpoint for `{run_dir = }`"
66
+ else:
67
+ if model_id_or_path not in GLOBAL_REGISTRY:
68
+ raise ValueError(f"Couldn't find `{model_id_or_path = }`; check `prismatic.available_model_names()`")
69
+
70
+ overwatch.info(f"Downloading `{(model_id := GLOBAL_REGISTRY[model_id_or_path]['model_id'])} from HF Hub")
71
+ with overwatch.local_zero_first():
72
+ config_json = hf_hub_download(repo_id=HF_HUB_REPO, filename=f"{model_id}/config.json", cache_dir=cache_dir)
73
+ checkpoint_pt = hf_hub_download(
74
+ repo_id=HF_HUB_REPO, filename=f"{model_id}/checkpoints/latest-checkpoint.pt", cache_dir=cache_dir
75
+ )
76
+
77
+ # Load Model Config from `config.json`
78
+ with open(config_json, "r") as f:
79
+ model_cfg = json.load(f)["model"]
80
+
81
+ # = Load Individual Components necessary for Instantiating a VLM =
82
+ # =>> Print Minimal Config
83
+ overwatch.info(
84
+ f"Found Config =>> Loading & Freezing [bold blue]{model_cfg['model_id']}[/] with:\n"
85
+ f" Vision Backbone =>> [bold]{model_cfg['vision_backbone_id']}[/]\n"
86
+ f" LLM Backbone =>> [bold]{model_cfg['llm_backbone_id']}[/]\n"
87
+ f" Arch Specifier =>> [bold]{model_cfg['arch_specifier']}[/]\n"
88
+ f" Checkpoint Path =>> [underline]`{checkpoint_pt}`[/]"
89
+ )
90
+
91
+ # Load Vision Backbone
92
+ overwatch.info(f"Loading Vision Backbone [bold]{model_cfg['vision_backbone_id']}[/]")
93
+ vision_backbone, image_transform = get_vision_backbone_and_transform(
94
+ model_cfg["vision_backbone_id"],
95
+ model_cfg["image_resize_strategy"],
96
+ )
97
+
98
+ # Load LLM Backbone --> note `inference_mode = True` by default when calling `load()`
99
+ overwatch.info(f"Loading Pretrained LLM [bold]{model_cfg['llm_backbone_id']}[/] via HF Transformers")
100
+ llm_backbone, tokenizer = get_llm_backbone_and_tokenizer(
101
+ model_cfg["llm_backbone_id"],
102
+ llm_max_length=model_cfg.get("llm_max_length", 2048),
103
+ hf_token=hf_token,
104
+ inference_mode=not load_for_training,
105
+ )
106
+
107
+ # Load VLM using `from_pretrained` (clobbers HF syntax... eventually should reconcile)
108
+ overwatch.info(f"Loading VLM [bold blue]{model_cfg['model_id']}[/] from Checkpoint")
109
+ vlm = PrismaticVLM.from_pretrained(
110
+ checkpoint_pt,
111
+ model_cfg["model_id"],
112
+ vision_backbone,
113
+ llm_backbone,
114
+ arch_specifier=model_cfg["arch_specifier"],
115
+ freeze_weights=not load_for_training,
116
+ )
117
+
118
+ return vlm
119
+
120
+
121
+ # === Load Pretrained VLA Model ===
122
+ def load_vla(
123
+ model_id_or_path: Union[str, Path],
124
+ hf_token: Optional[str] = None,
125
+ cache_dir: Optional[Union[str, Path]] = None,
126
+ load_for_training: bool = False,
127
+ step_to_load: Optional[int] = None,
128
+ model_type: str = "pretrained",
129
+ ) -> OpenVLA:
130
+ """Loads a pretrained OpenVLA from either local disk or the HuggingFace Hub."""
131
+
132
+ # TODO (siddk, moojink) :: Unify semantics with `load()` above; right now, `load_vla()` assumes path points to
133
+ # checkpoint `.pt` file, rather than the top-level run directory!
134
+ if os.path.isfile(model_id_or_path):
135
+ overwatch.info(f"Loading from local checkpoint path `{(checkpoint_pt := Path(model_id_or_path))}`")
136
+
137
+ # [Validate] Checkpoint Path should look like `.../<RUN_ID>/checkpoints/<CHECKPOINT_PATH>.pt`
138
+ assert (checkpoint_pt.suffix == ".pt") and (checkpoint_pt.parent.name == "checkpoints"), "Invalid checkpoint!"
139
+ run_dir = checkpoint_pt.parents[1]
140
+
141
+ # Get paths for `config.json`, `dataset_statistics.json` and pretrained checkpoint
142
+ config_json, dataset_statistics_json = run_dir / "config.json", run_dir / "dataset_statistics.json"
143
+ assert config_json.exists(), f"Missing `config.json` for `{run_dir = }`"
144
+ assert dataset_statistics_json.exists(), f"Missing `dataset_statistics.json` for `{run_dir = }`"
145
+
146
+ # Otherwise =>> try looking for a match on `model_id_or_path` on the HF Hub (`VLA_HF_HUB_REPO`)
147
+ else:
148
+ # Search HF Hub Repo via fsspec API
149
+ overwatch.info(f"Checking HF for `{(hf_path := str(Path(VLA_HF_HUB_REPO) / model_type / model_id_or_path))}`")
150
+ if not (tmpfs := HfFileSystem()).exists(hf_path):
151
+ raise ValueError(f"Couldn't find valid HF Hub Path `{hf_path = }`")
152
+
153
+ # Identify Checkpoint to Load (via `step_to_load`)
154
+ step_to_load = f"{step_to_load:06d}" if step_to_load is not None else None
155
+ valid_ckpts = tmpfs.glob(f"{hf_path}/checkpoints/step-{step_to_load if step_to_load is not None else ''}*.pt")
156
+ if (len(valid_ckpts) == 0) or (step_to_load is not None and len(valid_ckpts) != 1):
157
+ raise ValueError(f"Couldn't find a valid checkpoint to load from HF Hub Path `{hf_path}/checkpoints/`")
158
+
159
+ # Call to `glob` will sort steps in ascending order (if `step_to_load` is None); just grab last element
160
+ target_ckpt = Path(valid_ckpts[-1]).name
161
+
162
+ overwatch.info(f"Downloading Model `{model_id_or_path}` Config & Checkpoint `{target_ckpt}`")
163
+ with overwatch.local_zero_first():
164
+ relpath = Path(model_type) / model_id_or_path
165
+ config_json = hf_hub_download(
166
+ repo_id=VLA_HF_HUB_REPO, filename=f"{(relpath / 'config.json')!s}", cache_dir=cache_dir
167
+ )
168
+ dataset_statistics_json = hf_hub_download(
169
+ repo_id=VLA_HF_HUB_REPO, filename=f"{(relpath / 'dataset_statistics.json')!s}", cache_dir=cache_dir
170
+ )
171
+ checkpoint_pt = hf_hub_download(
172
+ repo_id=VLA_HF_HUB_REPO, filename=f"{(relpath / 'checkpoints' / target_ckpt)!s}", cache_dir=cache_dir
173
+ )
174
+
175
+ # Load VLA Config (and corresponding base VLM `ModelConfig`) from `config.json`
176
+ with open(config_json, "r") as f:
177
+ vla_cfg = json.load(f)["vla"]
178
+ model_cfg = ModelConfig.get_choice_class(vla_cfg["base_vlm"])()
179
+
180
+ # Load Dataset Statistics for Action Denormalization
181
+ with open(dataset_statistics_json, "r") as f:
182
+ norm_stats = json.load(f)
183
+
184
+ # = Load Individual Components necessary for Instantiating a VLA (via base VLM components) =
185
+ # =>> Print Minimal Config
186
+ overwatch.info(
187
+ f"Found Config =>> Loading & Freezing [bold blue]{model_cfg.model_id}[/] with:\n"
188
+ f" Vision Backbone =>> [bold]{model_cfg.vision_backbone_id}[/]\n"
189
+ f" LLM Backbone =>> [bold]{model_cfg.llm_backbone_id}[/]\n"
190
+ f" Arch Specifier =>> [bold]{model_cfg.arch_specifier}[/]\n"
191
+ f" Checkpoint Path =>> [underline]`{checkpoint_pt}`[/]"
192
+ )
193
+
194
+ # Load Vision Backbone
195
+ overwatch.info(f"Loading Vision Backbone [bold]{model_cfg.vision_backbone_id}[/]")
196
+ vision_backbone, image_transform = get_vision_backbone_and_transform(
197
+ model_cfg.vision_backbone_id,
198
+ model_cfg.image_resize_strategy,
199
+ )
200
+
201
+ # Load LLM Backbone --> note `inference_mode = True` by default when calling `load()`
202
+ overwatch.info(f"Loading Pretrained LLM [bold]{model_cfg.llm_backbone_id}[/] via HF Transformers")
203
+ llm_backbone, tokenizer = get_llm_backbone_and_tokenizer(
204
+ model_cfg.llm_backbone_id,
205
+ llm_max_length=model_cfg.llm_max_length,
206
+ hf_token=hf_token,
207
+ inference_mode=not load_for_training,
208
+ )
209
+
210
+ # Create Action Tokenizer
211
+ action_tokenizer = ActionTokenizer(llm_backbone.get_tokenizer())
212
+
213
+ # Load VLM using `from_pretrained` (clobbers HF syntax... eventually should reconcile)
214
+ overwatch.info(f"Loading VLA [bold blue]{model_cfg.model_id}[/] from Checkpoint")
215
+ vla = OpenVLA.from_pretrained(
216
+ checkpoint_pt,
217
+ model_cfg.model_id,
218
+ vision_backbone,
219
+ llm_backbone,
220
+ arch_specifier=model_cfg.arch_specifier,
221
+ freeze_weights=not load_for_training,
222
+ norm_stats=norm_stats,
223
+ action_tokenizer=action_tokenizer,
224
+ )
225
+
226
+ return vla
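For reference, a minimal usage sketch of the two entry points (the registry ID and paths are placeholders; as validated above, `load_vla()` expects a checkpoint `.pt` file under a `checkpoints/` directory rather than a run directory):

from prismatic.models.load import available_models, load, load_vla

print(available_models())
vlm = load("prism-dinosiglip+7b", hf_token=None)                     # VLM by registry ID or local run directory
vla = load_vla("/path/to/run_dir/checkpoints/latest-checkpoint.pt")  # VLA from a local checkpoint file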
prismatic/models/materialize.py ADDED
@@ -0,0 +1,130 @@
1
+ """
2
+ materialize.py
3
+
4
+ Factory class for initializing Vision Backbones, LLM Backbones, and VLMs from a set registry; provides and exports
5
+ individual functions for clear control flow.
6
+ """
7
+
8
+ from typing import Optional, Tuple
9
+
10
+ from transformers import PreTrainedTokenizerBase
11
+
12
+ from prismatic.models.backbones.llm import LLaMa2LLMBackbone, LLMBackbone, MistralLLMBackbone, PhiLLMBackbone
13
+ from prismatic.models.backbones.vision import (
14
+ CLIPViTBackbone,
15
+ DinoCLIPViTBackbone,
16
+ DinoSigLIPViTBackbone,
17
+ DinoV2ViTBackbone,
18
+ ImageTransform,
19
+ IN1KViTBackbone,
20
+ SigLIPViTBackbone,
21
+ VisionBackbone,
22
+ )
23
+ from prismatic.models.vlms import PrismaticVLM
24
+
25
+ # === Registries =>> Maps ID --> {cls(), kwargs} :: Different Registries for Vision Backbones, LLM Backbones, VLMs ===
26
+ # fmt: off
27
+
28
+ # === Vision Backbone Registry ===
29
+ VISION_BACKBONES = {
30
+ # === 224px Backbones ===
31
+ "clip-vit-l": {"cls": CLIPViTBackbone, "kwargs": {"default_image_size": 224}},
32
+ "siglip-vit-so400m": {"cls": SigLIPViTBackbone, "kwargs": {"default_image_size": 224}},
33
+ "dinov2-vit-l": {"cls": DinoV2ViTBackbone, "kwargs": {"default_image_size": 224}},
34
+ "in1k-vit-l": {"cls": IN1KViTBackbone, "kwargs": {"default_image_size": 224}},
35
+ "dinosiglip-vit-so-224px": {"cls": DinoSigLIPViTBackbone, "kwargs": {"default_image_size": 224}},
36
+
37
+ # === Assorted CLIP Backbones ===
38
+ "clip-vit-b": {"cls": CLIPViTBackbone, "kwargs": {"default_image_size": 224}},
39
+ "clip-vit-l-336px": {"cls": CLIPViTBackbone, "kwargs": {"default_image_size": 336}},
40
+
41
+ # === Assorted SigLIP Backbones ===
42
+ "siglip-vit-b16-224px": {"cls": SigLIPViTBackbone, "kwargs": {"default_image_size": 224}},
43
+ "siglip-vit-b16-256px": {"cls": SigLIPViTBackbone, "kwargs": {"default_image_size": 256}},
44
+ "siglip-vit-b16-384px": {"cls": SigLIPViTBackbone, "kwargs": {"default_image_size": 384}},
45
+ "siglip-vit-so400m-384px": {"cls": SigLIPViTBackbone, "kwargs": {"default_image_size": 384}},
46
+
47
+ # === Fused Backbones ===
48
+ "dinoclip-vit-l-336px": {"cls": DinoCLIPViTBackbone, "kwargs": {"default_image_size": 336}},
49
+ "dinosiglip-vit-so-384px": {"cls": DinoSigLIPViTBackbone, "kwargs": {"default_image_size": 384}},
50
+ }
51
+
52
+
53
+ # === Language Model Registry ===
54
+ LLM_BACKBONES = {
55
+ # === LLaMa-2 Pure (Non-Chat) Backbones ===
56
+ "llama2-7b-pure": {"cls": LLaMa2LLMBackbone, "kwargs": {}},
57
+ "llama2-13b-pure": {"cls": LLaMa2LLMBackbone, "kwargs": {}},
58
+
59
+ # === LLaMa-2 Chat Backbones ===
60
+ "llama2-7b-chat": {"cls": LLaMa2LLMBackbone, "kwargs": {}},
61
+ "llama2-13b-chat": {"cls": LLaMa2LLMBackbone, "kwargs": {}},
62
+
63
+ # === Vicuna-v1.5 Backbones ===
64
+ "vicuna-v15-7b": {"cls": LLaMa2LLMBackbone, "kwargs": {}},
65
+ "vicuna-v15-13b": {"cls": LLaMa2LLMBackbone, "kwargs": {}},
66
+
67
+ # === Mistral v0.1 Backbones ===
68
+ "mistral-v0.1-7b-pure": {"cls": MistralLLMBackbone, "kwargs": {}},
69
+ "mistral-v0.1-7b-instruct": {"cls": MistralLLMBackbone, "kwargs": {}},
70
+
71
+ # === Phi-2 Backbone ===
72
+ "phi-2-3b": {"cls": PhiLLMBackbone, "kwargs": {}},
73
+ }
74
+
75
+ # fmt: on
76
+
77
+
78
+ def get_vision_backbone_and_transform(
79
+ vision_backbone_id: str, image_resize_strategy: str
80
+ ) -> Tuple[VisionBackbone, ImageTransform]:
81
+ """Instantiate a Vision Backbone, returning both the nn.Module wrapper class and default Image Transform."""
82
+ if vision_backbone_id in VISION_BACKBONES:
83
+ vision_cfg = VISION_BACKBONES[vision_backbone_id]
84
+ vision_backbone: VisionBackbone = vision_cfg["cls"](
85
+ vision_backbone_id, image_resize_strategy, **vision_cfg["kwargs"]
86
+ )
87
+ image_transform = vision_backbone.get_image_transform()
88
+ return vision_backbone, image_transform
89
+
90
+ else:
91
+ raise ValueError(f"Vision Backbone `{vision_backbone_id}` is not supported!")
92
+
93
+
94
+ def get_llm_backbone_and_tokenizer(
95
+ llm_backbone_id: str,
96
+ llm_max_length: int = 2048,
97
+ hf_token: Optional[str] = None,
98
+ inference_mode: bool = False,
99
+ ) -> Tuple[LLMBackbone, PreTrainedTokenizerBase]:
100
+ if llm_backbone_id in LLM_BACKBONES:
101
+ llm_cfg = LLM_BACKBONES[llm_backbone_id]
102
+ llm_backbone: LLMBackbone = llm_cfg["cls"](
103
+ llm_backbone_id,
104
+ llm_max_length=llm_max_length,
105
+ hf_token=hf_token,
106
+ inference_mode=inference_mode,
107
+ **llm_cfg["kwargs"],
108
+ )
109
+ tokenizer = llm_backbone.get_tokenizer()
110
+ return llm_backbone, tokenizer
111
+
112
+ else:
113
+ raise ValueError(f"LLM Backbone `{llm_backbone_id}` is not supported!")
114
+
115
+
116
+ def get_vlm(
117
+ model_id: str,
118
+ arch_specifier: str,
119
+ vision_backbone: VisionBackbone,
120
+ llm_backbone: LLMBackbone,
121
+ enable_mixed_precision_training: bool = True,
122
+ ) -> PrismaticVLM:
123
+ """Lightweight wrapper around initializing a VLM, mostly for future-proofing (if one wants to add a new VLM)."""
124
+ return PrismaticVLM(
125
+ model_id,
126
+ vision_backbone,
127
+ llm_backbone,
128
+ enable_mixed_precision_training=enable_mixed_precision_training,
129
+ arch_specifier=arch_specifier,
130
+ )
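A minimal sketch tying the two registries together (illustrative; the resize strategy "resize-naive" and the `arch_specifier` value "gelu-mlp" are assumed to be supported, cf. the projector names in `prismatic/util/nn_utils.py` below):

from prismatic.models.materialize import get_llm_backbone_and_tokenizer, get_vision_backbone_and_transform, get_vlm

vision_backbone, image_transform = get_vision_backbone_and_transform("siglip-vit-so400m", "resize-naive")
llm_backbone, tokenizer = get_llm_backbone_and_tokenizer("llama2-7b-pure", llm_max_length=2048)
vlm = get_vlm("siglip-224px+7b", "gelu-mlp", vision_backbone, llm_backbone)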
prismatic/models/projectors.py ADDED
@@ -0,0 +1,67 @@
1
+ """Implementation of additional projectors for additional inputs to the VLA models."""
2
+ import torch
3
+ import torch.nn as nn
4
+ from einops import rearrange
5
+
6
+
7
+ class ProprioProjector(nn.Module):
8
+ """
9
+ Projects proprio state inputs into the LLM's embedding space.
10
+ """
11
+ def __init__(self, llm_dim: int, proprio_dim: int) -> None:
12
+ super().__init__()
13
+ self.llm_dim = llm_dim
14
+ self.proprio_dim = proprio_dim
15
+
16
+ self.fc1 = nn.Linear(self.proprio_dim, self.llm_dim, bias=True)
17
+ self.fc2 = nn.Linear(self.llm_dim, self.llm_dim, bias=True)
18
+ self.act_fn1 = nn.GELU()
19
+
20
+ def forward(self, proprio: torch.Tensor = None) -> torch.Tensor:
21
+ # proprio: (bsz, proprio_dim)
22
+ projected_features = self.fc1(proprio)
23
+ projected_features = self.act_fn1(projected_features)
24
+ projected_features = self.fc2(projected_features)
25
+ return projected_features
26
+
27
+
28
+ class NoisyActionProjector(nn.Module):
29
+ """
30
+ [Diffusion] Projects noisy action inputs into the LLM's embedding space.
31
+
32
+ Note that since each action is tokenized into 7 tokens in OpenVLA (rather
33
+ than having 1 token per action), each noisy action token will have dimension 1
34
+ instead of 7.
35
+ """
36
+ def __init__(self, llm_dim: int) -> None:
37
+ super().__init__()
38
+ self.llm_dim = llm_dim
39
+ self.action_token_dim = 1
40
+
41
+ self.fc1 = nn.Linear(self.action_token_dim, self.llm_dim, bias=True)
42
+ self.fc2 = nn.Linear(self.llm_dim, self.llm_dim, bias=True)
43
+ self.act_fn1 = nn.GELU()
44
+
45
+ def forward(self, noisy_actions: torch.Tensor = None) -> torch.Tensor:
46
+ # noisy_actions: (bsz, num_action_tokens=chunk_len*action_dim, 1)
47
+ projected_features = self.fc1(noisy_actions)
48
+ projected_features = self.act_fn1(projected_features)
49
+ projected_features = self.fc2(projected_features)
50
+ return projected_features
51
+
52
+
53
+
54
+
55
+ class VisualProjector(nn.Module):
56
+ def __init__(self, llm_dim: int, visual_dim: int) -> None:
57
+ super().__init__()
58
+ self.visual_dim, self.llm_dim = visual_dim, llm_dim
59
+ self.fc1 = nn.Linear(self.llm_dim, self.llm_dim, bias=True)
60
+ self.fc2 = nn.Linear(self.llm_dim, self.visual_dim, bias=True)
61
+ self.act_fn1 = nn.GELU()
62
+
63
+ def forward(self, img_hidden_embedding: torch.Tensor) -> torch.Tensor:
64
+ projected_features = self.fc1(img_hidden_embedding)
65
+ projected_features = self.act_fn1(projected_features)
66
+ projected_features = self.fc2(projected_features)
67
+ return projected_features
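A minimal shape check for the projectors above (dimensions are illustrative: 4096 matches a LLaMa-2 7B hidden size, and 56 stands in for `chunk_len * action_dim`):

import torch

from prismatic.models.projectors import NoisyActionProjector, ProprioProjector

proprio = ProprioProjector(llm_dim=4096, proprio_dim=8)(torch.randn(2, 8))
assert proprio.shape == (2, 4096)                    # (bsz, proprio_dim) -> (bsz, llm_dim)

noisy = NoisyActionProjector(llm_dim=4096)(torch.randn(2, 56, 1))
assert noisy.shape == (2, 56, 4096)                  # one embedding per (1-dimensional) noisy action token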
prismatic/preprocessing/datasets/datasets.py ADDED
@@ -0,0 +1,200 @@
1
+ """
2
+ datasets.py
3
+
4
+ PyTorch Dataset Definitions for Prismatic models; supports processing for both the `align` and `finetune` stages, with
5
+ utilities for formatting conversations during the `finetune` stage subject to the given LLM backbone's expected
6
+ formatting (e.g., SYS_PROMPT + USER: ... ASSISTANT: ... for Vicuña v1.5 Chat models).
7
+
8
+ We currently only support Map-style Datasets; assumes that all files (annotations, images) are on local disk, and that
9
+ random access image reading is relatively cheap/fast.
10
+ """
11
+
12
+ import copy
13
+ import json
14
+ from pathlib import Path
15
+ from typing import Dict, List, Tuple, Type
16
+
17
+ import torch
18
+ from PIL import Image
19
+ from torch.utils.data import Dataset
20
+ from transformers import CodeGenTokenizerFast, LlamaTokenizerFast, PreTrainedTokenizerBase
21
+
22
+ from prismatic.models.backbones.llm.prompting import PromptBuilder
23
+ from prismatic.models.backbones.vision import ImageTransform
24
+
25
+ # HuggingFace Default / LLaMa-2 IGNORE_INDEX (for labels)
26
+ IGNORE_INDEX = -100
27
+
28
+
29
+ class AlignDataset(Dataset[Dict[str, torch.Tensor]]):
30
+ def __init__(
31
+ self,
32
+ chat_json: Path,
33
+ image_dir: Path,
34
+ image_transform: ImageTransform,
35
+ tokenizer: PreTrainedTokenizerBase,
36
+ ) -> None:
37
+ super().__init__()
38
+ self.chat_json, self.image_dir = chat_json, image_dir
39
+ self.image_transform, self.tokenizer = image_transform, tokenizer
40
+ self.dataset_type = "align"
41
+
42
+ # Create Prompt Template
43
+ self.prompt_template = "{caption}" + self.tokenizer.eos_token
44
+
45
+ # Load Chat JSON
46
+ with open(self.chat_json, "r") as f:
47
+ self.examples = json.load(f)
48
+
49
+ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
50
+ """
51
+ Following the *actual* code executed from the LLaVa codebase, during the "align" phase, we actually discard
52
+ the "prompt" from the human, and instead directly predict the caption from the image.
53
+
54
+ As a concrete example given the "raw data" for the first example:
55
+ example = self.examples[0]["conversations"] = {
56
+ [
57
+ {"from": "human", "value": "Render a clear and concise summary of the photo.\n<image>"},
58
+ {"from": "gpt", "value": "select luxury furniture 3 - inch gel memory foam mattress topper"}
59
+ ]
60
+ }
61
+
62
+ Return =>> self.tokenizer("<image> select luxury furniture 3 - inch gel memory foam mattress topper\n")
63
+
64
+ :param idx: Index to retrieve from the dataset.
65
+
66
+ :return: Dictionary of {"pixel_values": torch.Tensor, "input_ids": torch.Tensor, "labels": torch.Tensor}
67
+ """
68
+ image_path, conversation = Path(self.examples[idx]["image"]), self.examples[idx]["conversations"]
69
+ assert (len(conversation) == 2) and ("<image>" not in conversation[-1]["value"]), "Unexpected text!"
70
+
71
+ # Format Caption --> {caption}{eos_token}
72
+ caption = self.prompt_template.format(caption=conversation[-1]["value"].strip())
73
+
74
+ # We treat image patches as "tokens = [p1 p2 p3, ...]"; we need to specify ordering of text/patch tokens.
75
+ # => Critically, we find that inserting *after* the BOS token leads to the strongest performance!
76
+ # - input_ids = "<s> p1 p2 p3 ... <caption_text> \n"
77
+ # - labels = "IGNORE IGNORE ..." (copy `input_ids` replacing <s> and p{1...K} with IGNORE)
78
+ #
79
+ # IMPORTANT => IF WE'RE USING HF LLM.forward(... labels=labels), SHIFTING HAPPENS _INSIDE_ MODEL!
80
+ input_ids = self.tokenizer(caption, truncation=True, return_tensors="pt").input_ids[0]
81
+ labels = copy.deepcopy(input_ids)
82
+
83
+ # Set the <BOS> token's label to IGNORE_INDEX (since we're inserting the image patches right after)
84
+ labels[0] = IGNORE_INDEX
85
+
86
+ # Process Image --> get "pixel_values" (will either be a torch.Tensor OR a Dict[str,torch.Tensor])
87
+ pixel_values = self.image_transform(Image.open(self.image_dir / image_path).convert("RGB"))
88
+
89
+ return dict(pixel_values=pixel_values, input_ids=input_ids, labels=labels)
90
+
91
+ def get_modality_lengths(self, n_image_patches: int) -> List[Tuple[bool, int]]:
92
+ """Get a list of modalities (unimodal / text-only vs. multimodal) and length of conversations per example."""
93
+ modality_lengths = []
94
+ for example in self.examples:
95
+ is_multimodal = "image" in example
96
+ n_words = sum([len(turn["value"].replace("<image>", "").split()) for turn in example["conversations"]])
97
+ modality_lengths.append((is_multimodal, (n_image_patches + n_words) if is_multimodal else n_words))
98
+ return modality_lengths
99
+
100
+ def __len__(self) -> int:
101
+ return len(self.examples)
102
+
103
+
104
+ class FinetuneDataset(Dataset[Dict[str, torch.Tensor]]):
105
+ def __init__(
106
+ self,
107
+ instruct_json: Path,
108
+ image_dir: Path,
109
+ image_transform: ImageTransform,
110
+ tokenizer: PreTrainedTokenizerBase,
111
+ prompt_builder_fn: Type[PromptBuilder],
112
+ ) -> None:
113
+ super().__init__()
114
+ self.instruct_json, self.image_dir = instruct_json, image_dir
115
+ self.image_transform, self.tokenizer = image_transform, tokenizer
116
+ self.prompt_builder_fn = prompt_builder_fn
117
+ self.dataset_type = "finetune"
118
+
119
+ # Load Instruct JSON
120
+ with open(self.instruct_json, "r") as f:
121
+ self.examples = json.load(f)
122
+
123
+ # === Unimodal + Multimodal Handling ===
124
+ def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
125
+ """
126
+ Unlike the *align* stage handling, for the *finetune* stage, we actually need to handle multiple "turns" of
127
+ dialog grounded in a single image.
128
+
129
+ To do this, we leverage the `prompt_builder_fn` which instantiates a PromptBuilder object. By calling the
130
+ methods for adding turns and getting a prompt, we ensure proper formatting and consistency for each example.
131
+
132
+ :param idx: Index to retrieve from the dataset.
133
+
134
+ :return: Dictionary of {"pixel_values": torch.Tensor, "input_ids": torch.Tensor, "labels": torch.Tensor}
135
+ """
136
+ conversation = self.examples[idx]["conversations"]
137
+
138
+ # Create Prompt Builder --> add each message sequentially
139
+ prompt_builder, input_ids, labels = self.prompt_builder_fn(model_family="prismatic"), [], []
140
+ for turn_idx, turn in enumerate(conversation):
141
+ # Get "effective" string added to prompt --> handle whitespace for tokenizer type!
142
+ msg = prompt_builder.add_turn(turn["from"], turn["value"])
143
+
144
+ # Llama Tokenizer (Fast) adds extra character if a string ends in whitespace --> strip if non-empty!
145
+ if isinstance(self.tokenizer, LlamaTokenizerFast):
146
+ msg = msg.rstrip()
147
+
148
+ # Phi-2 Tokenizer == CodeGenTokenizer (Fast) -- no special handling!
149
+ elif isinstance(self.tokenizer, CodeGenTokenizerFast):
150
+ pass
151
+
152
+ else:
153
+ raise ValueError(f"Tokenizer of type `{type(self.tokenizer)}` is not explicitly handled!")
154
+
155
+ # Tokenize Input IDs
156
+ turn_input_ids = self.tokenizer(msg, add_special_tokens=turn_idx == 0).input_ids
157
+
158
+ # [CRITICAL] We do not want to take the loss for the "USER: <msg>" prompts =>> just the responses!
159
+ turn_labels = (
160
+ [IGNORE_INDEX for _ in range(len(turn_input_ids))] if (turn_idx % 2) == 0 else list(turn_input_ids)
161
+ )
162
+
163
+ # Add to Trackers
164
+ input_ids.extend(turn_input_ids)
165
+ labels.extend(turn_labels)
166
+
167
+ # Tensorize =>> Set the <BOS> token's label to IGNORE_INDEX (since we're inserting the image patches after)
168
+ # - IMPORTANT => IF WE'RE USING HF LLM.forward(... labels=labels), SHIFTING HAPPENS _INSIDE_ MODEL!
169
+ input_ids, labels = torch.tensor(input_ids), torch.tensor(labels)
170
+
171
+ # Handle Truncation (if necessary)
172
+ input_ids, labels = input_ids[: self.tokenizer.model_max_length], labels[: self.tokenizer.model_max_length]
173
+
174
+ # === Handle "unimodal" (language-only) vs. "multimodal" ===
175
+ if "image" in self.examples[idx]:
176
+ image_path = Path(self.examples[idx]["image"])
177
+
178
+ # Set the <BOS> token's label to IGNORE_INDEX (since we're inserting the image patches right after)
179
+ labels[0] = IGNORE_INDEX
180
+
181
+ # Process Image --> get "pixel_values" (will either be a torch.Tensor OR a Dict[str,torch.Tensor])
182
+ pixel_values = self.image_transform(Image.open(self.image_dir / image_path).convert("RGB"))
183
+
184
+ return dict(pixel_values=pixel_values, input_ids=input_ids, labels=labels)
185
+
186
+ else:
187
+ # No image --> return `pixel_values` = None; Collator will do the smart batch handling for us!
188
+ return dict(pixel_values=None, input_ids=input_ids, labels=labels)
189
+
190
+ def get_modality_lengths(self) -> List[Tuple[bool, int]]:
191
+ """Get a list of modalities (unimodal / text-only vs. multimodal) and length of conversations per example."""
192
+ modality_lengths = []
193
+ for example in self.examples:
194
+ is_multimodal = "image" in example
195
+ n_words = sum([len(turn["value"].split()) for turn in example["conversations"]])
196
+ modality_lengths.append((is_multimodal, n_words))
197
+ return modality_lengths
198
+
199
+ def __len__(self) -> int:
200
+ return len(self.examples)
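The per-turn masking rule in `FinetuneDataset.__getitem__` is easiest to see on toy data; a minimal sketch (the token ids are made up): user turns (even indices) contribute only IGNORE_INDEX labels, while assistant turns (odd indices) are supervised directly.

IGNORE_INDEX = -100
turn_token_ids = [[101, 7, 8], [9, 10], [11, 12, 13], [14]]   # hypothetical: user, gpt, user, gpt turns
labels = []
for turn_idx, ids in enumerate(turn_token_ids):
    labels.extend([IGNORE_INDEX] * len(ids) if (turn_idx % 2) == 0 else list(ids))
print(labels)  # [-100, -100, -100, 9, 10, -100, -100, -100, 14]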
prismatic/preprocessing/materialize.py ADDED
@@ -0,0 +1,69 @@
1
+ """
2
+ materialize.py
3
+
4
+ Factory class for initializing pretraining datasets on a per-VLM basis; provides and exports individual functions for
5
+ clear control flow.
6
+ """
7
+
8
+ from typing import Tuple, Type
9
+
10
+ from torch.utils.data import Dataset
11
+ from transformers import PreTrainedTokenizerBase
12
+
13
+ from prismatic.conf import DatasetConfig
14
+ from prismatic.models.backbones.llm.prompting import PromptBuilder
15
+ from prismatic.models.backbones.vision import ImageTransform
16
+ from prismatic.preprocessing.datasets import AlignDataset, FinetuneDataset
17
+ from prismatic.util.data_utils import PaddedCollatorForLanguageModeling
18
+
19
+ # Dataset Initializers =>> Maps Stage --> cls()
20
+ DATASET_INITIALIZER = {"align": AlignDataset, "finetune": FinetuneDataset, "full-finetune": FinetuneDataset}
21
+
22
+
23
+ def get_dataset_and_collator(
24
+ stage: str,
25
+ dataset_cfg: DatasetConfig,
26
+ image_transform: ImageTransform,
27
+ tokenizer: PreTrainedTokenizerBase,
28
+ prompt_builder_fn: Type[PromptBuilder],
29
+ default_image_resolution: Tuple[int, int, int],
30
+ padding_side: str = "right",
31
+ ) -> Tuple[Dataset, PaddedCollatorForLanguageModeling]:
32
+ dataset_cls = DATASET_INITIALIZER[stage]
33
+ dataset_root_dir = dataset_cfg.dataset_root_dir
34
+ collator = PaddedCollatorForLanguageModeling(
35
+ tokenizer.model_max_length, tokenizer.pad_token_id, default_image_resolution, padding_side=padding_side
36
+ )
37
+
38
+ # Switch on `stage`
39
+ if stage == "align":
40
+ annotation_json, image_dir = dataset_cfg.align_stage_components
41
+ dataset = dataset_cls(
42
+ dataset_root_dir / annotation_json, dataset_root_dir / image_dir, image_transform, tokenizer
43
+ )
44
+ return dataset, collator
45
+
46
+ elif stage == "finetune":
47
+ annotation_json, image_dir = dataset_cfg.finetune_stage_components
48
+ dataset = dataset_cls(
49
+ dataset_root_dir / annotation_json,
50
+ dataset_root_dir / image_dir,
51
+ image_transform,
52
+ tokenizer,
53
+ prompt_builder_fn=prompt_builder_fn,
54
+ )
55
+ return dataset, collator
56
+
57
+ elif stage == "full-finetune":
58
+ annotation_json, image_dir = dataset_cfg.finetune_stage_components
59
+ dataset = dataset_cls(
60
+ dataset_root_dir / annotation_json,
61
+ dataset_root_dir / image_dir,
62
+ image_transform,
63
+ tokenizer,
64
+ prompt_builder_fn=prompt_builder_fn,
65
+ )
66
+ return dataset, collator
67
+
68
+ else:
69
+ raise ValueError(f"Stage `{stage}` is not supported!")
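A minimal sketch of wiring this into a DataLoader (backbone IDs are placeholders; `prompt_builder_fn` and `default_image_resolution` are assumed to be exposed by the LLM and vision backbones, which are not part of this commit):

from torch.utils.data import DataLoader

from prismatic.conf import DatasetConfig
from prismatic.models.materialize import get_llm_backbone_and_tokenizer, get_vision_backbone_and_transform
from prismatic.preprocessing.materialize import get_dataset_and_collator

vision_backbone, image_transform = get_vision_backbone_and_transform("clip-vit-l", "resize-naive")
llm_backbone, tokenizer = get_llm_backbone_and_tokenizer("vicuna-v15-7b")
dataset_cfg = DatasetConfig.get_choice_class("llava-v15")()

dataset, collator = get_dataset_and_collator(
    "align", dataset_cfg, image_transform, tokenizer,
    prompt_builder_fn=llm_backbone.prompt_builder_fn,                    # assumed LLMBackbone attribute
    default_image_resolution=vision_backbone.default_image_resolution,  # assumed VisionBackbone attribute
)
loader = DataLoader(dataset, batch_size=16, collate_fn=collator, num_workers=4)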
prismatic/py.typed ADDED
File without changes
prismatic/util/nn_utils.py ADDED
@@ -0,0 +1,53 @@
1
+ """
2
+ nn_utils.py
3
+
4
+ Utility functions and PyTorch submodule definitions.
5
+ """
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+
11
+ # === Definitions for Various Projection Modules, with Signature :: [..., in_dim] --> [..., out_dim] ===
12
+ class LinearProjector(nn.Module):
13
+ def __init__(self, vision_dim: int, llm_dim: int) -> None:
14
+ super().__init__()
15
+ self.projector = nn.Linear(vision_dim, llm_dim, bias=True)
16
+
17
+ def forward(self, img_patches: torch.Tensor) -> torch.Tensor:
18
+ return self.projector(img_patches)
19
+
20
+
21
+ class MLPProjector(nn.Module):
22
+ def __init__(self, vision_dim: int, llm_dim: int, mlp_type: str = "gelu-mlp") -> None:
23
+ super().__init__()
24
+ if mlp_type == "gelu-mlp":
25
+ self.projector = nn.Sequential(
26
+ nn.Linear(vision_dim, llm_dim, bias=True),
27
+ nn.GELU(),
28
+ nn.Linear(llm_dim, llm_dim, bias=True),
29
+ )
30
+ else:
31
+ raise ValueError(f"Projector with `{mlp_type = }` is not supported!")
32
+
33
+ def forward(self, img_patches: torch.Tensor) -> torch.Tensor:
34
+ return self.projector(img_patches)
35
+
36
+
37
+ class FusedMLPProjector(nn.Module):
38
+ def __init__(self, fused_vision_dim: int, llm_dim: int, mlp_type: str = "fused-gelu-mlp") -> None:
39
+ super().__init__()
40
+ self.initial_projection_dim = fused_vision_dim * 4
41
+ if mlp_type == "fused-gelu-mlp":
42
+ self.projector = nn.Sequential(
43
+ nn.Linear(fused_vision_dim, self.initial_projection_dim, bias=True),
44
+ nn.GELU(),
45
+ nn.Linear(self.initial_projection_dim, llm_dim, bias=True),
46
+ nn.GELU(),
47
+ nn.Linear(llm_dim, llm_dim, bias=True),
48
+ )
49
+ else:
50
+ raise ValueError(f"Fused Projector with `{mlp_type = }` is not supported!")
51
+
52
+ def forward(self, fused_img_patches: torch.Tensor) -> torch.Tensor:
53
+ return self.projector(fused_img_patches)
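Shape-wise, all three projectors map `[..., in_dim] -> [..., llm_dim]`; a quick illustrative check (1024 and 1152 stand in for DINOv2-L and SigLIP feature widths):

import torch

from prismatic.util.nn_utils import FusedMLPProjector, MLPProjector

patches = torch.randn(2, 256, 1024)                          # (bsz, n_patches, vision_dim)
print(MLPProjector(1024, 4096)(patches).shape)               # torch.Size([2, 256, 4096])

fused = torch.randn(2, 256, 1024 + 1152)                     # concatenated DINOv2 + SigLIP features
print(FusedMLPProjector(1024 + 1152, 4096)(fused).shape)     # torch.Size([2, 256, 4096])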
prismatic/vla/datasets/rlds/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .dataset import make_interleaved_dataset, make_single_dataset
prismatic/vla/datasets/rlds/dataset.py ADDED
@@ -0,0 +1,655 @@
1
+ """
2
+ dataset.py
3
+
4
+ Core interface script for configuring and initializing RLDS datasets.
5
+ """
6
+
7
+ import copy
8
+ import inspect
9
+ import json
10
+ import random # Import the random module
11
+ from functools import partial
12
+ from typing import Callable, Dict, List, Optional, Tuple, Union
13
+
14
+ import dlimp as dl
15
+ import numpy as np
16
+ import tensorflow as tf
17
+ import tensorflow_datasets as tfds
18
+
19
+ from prismatic.overwatch import initialize_overwatch
20
+ from prismatic.vla.constants import ACTION_DIM, ACTION_PROPRIO_NORMALIZATION_TYPE, ACTION_TOKEN_BEGIN_IDX, IGNORE_INDEX, NUM_ACTIONS_CHUNK, PROPRIO_DIM, STOP_INDEX
21
+ from prismatic.vla.datasets.rlds import obs_transforms, traj_transforms
22
+ from prismatic.vla.datasets.rlds.utils import goal_relabeling, task_augmentation
23
+ from prismatic.vla.datasets.rlds.utils.data_utils import (
24
+ allocate_threads,
25
+ get_dataset_statistics,
26
+ normalize_action_and_proprio,
27
+ pprint_data_mixture,
28
+ tree_map,
29
+ shuffle_dataset, # Newly added: import the shuffle_dataset function
30
+ )
31
+
32
+ # Initialize Overwatch =>> Wraps `logging.Logger`
33
+ overwatch = initialize_overwatch(__name__)
34
+
35
+ # # Adds a function to set all random seeds
36
+ # def set_all_seeds(seed):
37
+ # """Set the seeds of all random number generators to ensure reproducibility."""
38
+ # random.seed(seed)
39
+ # np.random.seed(seed)
40
+ # tf.random.set_seed(seed)
41
+ # # Enable TensorFlow deterministic operations (if supported by the TensorFlow version)
42
+ # try:
43
+ # tf.config.experimental.enable_op_determinism()
44
+ # except AttributeError:
45
+ # overwatch.warning("The TensorFlow version does not support enable_op_determinism, and the results may not be fully reproducible.")
46
+
47
+
48
+ # Configure Tensorflow with *no GPU devices* (to prevent clobber with PyTorch)
49
+ tf.config.set_visible_devices([], "GPU")
50
+
51
+
52
+ # # Try to get seeds from environment variables or global Settings and set them
53
+ # try:
54
+ # from prismatic.training.train_utils import get_global_seed
55
+ # seed = get_global_seed()
56
+ # if seed is not None:
57
+ # set_all_seeds(seed)
58
+ # overwatch.info(f"The Dataset module has been set with a random seed: {seed}")
59
+ # except (ImportError, NameError):
60
+ # overwatch.warning("The global seed setting cannot be obtained, so the data processing may not be fully reproducible.")
61
+
62
+
63
+ # ruff: noqa: B006
64
+ def make_dataset_from_rlds(
65
+ name: str,
66
+ data_dir: str,
67
+ *,
68
+ train: bool,
69
+ shuffle_seed: int,
70
+ standardize_fn: Optional[Callable[[dict], dict]] = None,
71
+ shuffle: bool = True,
72
+ image_obs_keys: Dict[str, Optional[str]] = {},
73
+ depth_obs_keys: Dict[str, Optional[str]] = {},
74
+ state_obs_keys: List[Optional[str]] = (),
75
+ language_key: Optional[str] = None,
76
+ action_proprio_normalization_type: ACTION_PROPRIO_NORMALIZATION_TYPE,
77
+ dataset_statistics: Optional[Union[dict, str]] = None,
78
+ absolute_action_mask: Optional[List[bool]] = None,
79
+ action_normalization_mask: Optional[List[bool]] = None,
80
+ num_parallel_reads: int = tf.data.AUTOTUNE,
81
+ num_parallel_calls: int = tf.data.AUTOTUNE,
82
+ ) -> Tuple[dl.DLataset, dict]:
83
+ """
84
+ This function is responsible for loading a specific RLDS dataset from storage and getting it into a standardized
85
+ format. Yields a dataset of trajectories. Does not include CPU-intensive operations.
86
+
87
+ If `standardize_fn` is provided, it will be applied to each trajectory. This function should get the trajectory
88
+ into a standard format, which includes the keys "observation" and "action". Entry "observation" should be a
89
+ dictionary containing some number of additional keys, which will be extracted into an even more standardized format
90
+ according to the "*_obs_keys" arguments.
91
+
92
+ The `image_obs_keys` and `depth_obs_keys` arguments are mappings from new names to old names, or None in place of an
93
+ old name to insert padding. For example, if after `standardize_fn`, your "observation" dict has RGB images called
94
+ "workspace" and "wrist", and `image_obs_keys={"primary": "workspace", "secondary": None, "wrist": "wrist"}`, then
95
+ the resulting dataset will have an "observation" dict containing the keys "image_primary", "image_secondary", and
96
+ "image_wrist", where "image_primary" corresponds to "workspace", "image_secondary" is a padding image, and
97
+ "image_wrist" corresponds to "wrist".
98
+
99
+ Entry `state_obs_keys` is a list of 1-dimensional proprioceptive keys to concatenate into a single array, which will
100
+ be placed in the "proprio" key of the "observation" dict. A single padding element (zero) will be inserted for each
101
+ None entry.
102
+
103
+ The dataset will also include a "task" dict. If `language_key` is provided, then the "task" dict will contain the
104
+ key "language_instruction", extracted from `traj[language_key]`.
105
+
106
+ Args:
107
+ name (str): The name of the RLDS dataset (usually "name" or "name:version").
108
+ data_dir (str): The path to the data directory.
109
+ train (bool): Whether to use the training or validation split.
+ shuffle_seed (int): Random seed used when shuffling the file read order, for reproducibility.
110
+ shuffle (bool, optional): Whether to shuffle the file read order (does NOT fully shuffle the dataset, since one
111
+ file usually contains many trajectories)!
112
+ standardize_fn (Callable[[dict], dict], optional): A function that, if provided, will be the first
113
+ thing applied to each trajectory.
114
+ image_obs_keys (Mapping[str, str|None]): Mapping from {new: old} indicating which RGB images to extract from the
115
+ "observation" dict. `new_obs = {f"image_{new}": old_obs[old] for new, old in image_obs_keys.items()}`.
116
+ If a value of `old` is None, inserts a padding image instead (empty string).
117
+ depth_obs_keys (Mapping[str, str|None]): Same as `image_obs_keys`, but for depth images. Keys will be
118
+ prefixed with "depth_" instead of "image_".
119
+ state_obs_keys (Sequence[str|None]): List of 1-dimensional proprioception keys to be extracted from the
120
+ "observation" dict, concatenated, and mapped to "proprio". Inserts 1 element of padding for each None entry.
121
+ language_key (str, optional): If provided, the "task" dict will contain the key "language_instruction",
122
+ extracted from `traj[language_key]`.
123
+ action_proprio_normalization_type (str, optional): The type of normalization to perform on the action,
124
+ proprio, or both. Can be "normal" (mean 0, std 1) or "bounds" (normalized to [-1, 1]).
125
+ dataset_statistics: (dict|str, optional): dict (or path to JSON file) that contains dataset statistics
126
+ for normalization. If `action_proprio_normalization_type` is "normal", this should contain "mean" and
127
+ "std" keys. If `action_proprio_normalization_type` is "bounds", this should contain "min" and "max"
128
+ keys. May also provide "num_transitions" and "num_trajectories" keys for downstream usage (e.g., for
129
+ `make_interleaved_dataset`). If not provided, the statistics will be computed on the fly.
130
+ absolute_action_mask (Sequence[bool], optional): By default, all action dimensions are assumed to be
131
+ relative. This is important for when `future_action_window_size > 0`: actions that are taken
132
+ from beyond the end of the trajectory (or beyond the goal timestep when goal relabeling is used)
133
+ need to be made "neutral" to indicate that the task has been completed. For relative actions,
134
+ "neutral" means zero, but for absolute actions, "neutral" means repeating the last valid action.
135
+ This mask, if provided, indicates which action dimensions are absolute.
136
+ action_normalization_mask (Sequence[bool], optional): If provided, indicates which action dimensions
137
+ should be normalized. For example, you might not want to normalize the gripper action dimension if
138
+ it's always exactly 0 or 1. By default, all action dimensions are normalized.
139
+ num_parallel_reads (int): number of parallel read workers. Default to AUTOTUNE.
140
+ num_parallel_calls (int): number of parallel calls for traj_map operations. Default to AUTOTUNE.
141
+ Returns:
142
+ Dataset of trajectories where each step has the following fields:
143
+ - observation:
144
+ - image_{name1, name2, ...} # RGB image observations
145
+ - depth_{name1, name2, ...} # depth image observations
146
+ - proprio # 1-dimensional array of proprioceptive observations
147
+ - timestep # timestep of each frame
148
+ - task:
149
+ - language_instruction # language instruction, present if `language_key` is provided
150
+ - action # action vector
151
+ - dataset_name # name of the dataset
152
+ """
153
+ REQUIRED_KEYS = {"observation", "action"}
154
+ if language_key is not None:
155
+ REQUIRED_KEYS.add(language_key)
156
+
157
+ def restructure(traj):
158
+ # apply a standardization function, if provided
159
+ if standardize_fn is not None:
160
+ traj = standardize_fn(traj)
161
+
162
+ if not all(k in traj for k in REQUIRED_KEYS):
163
+ raise ValueError(
164
+ f"Trajectory is missing keys: {REQUIRED_KEYS - set(traj.keys())}. " "Did you write a `standardize_fn`?"
165
+ )
166
+
167
+ # extracts images, depth images and proprio from the "observation" dict
168
+ traj_len = tf.shape(traj["action"])[0]
169
+ old_obs = traj["observation"]
170
+ new_obs = {}
171
+ for new, old in image_obs_keys.items():
172
+ if old is None:
173
+ new_obs[f"image_{new}"] = tf.repeat("", traj_len) # padding
174
+ else:
175
+ new_obs[f"image_{new}"] = old_obs[old]
176
+
177
+ for new, old in depth_obs_keys.items():
178
+ if old is None:
179
+ new_obs[f"depth_{new}"] = tf.repeat("", traj_len) # padding
180
+ else:
181
+ new_obs[f"depth_{new}"] = old_obs[old]
182
+
183
+ if state_obs_keys:
184
+ new_obs["proprio"] = tf.concat(
185
+ [
186
+ (
187
+ tf.zeros((traj_len, 1), dtype=tf.float32) # padding
188
+ if key is None
189
+ else tf.cast(old_obs[key], tf.float32)
190
+ )
191
+ for key in state_obs_keys
192
+ ],
193
+ axis=1,
194
+ )
195
+
196
+ # add timestep info
197
+ new_obs["timestep"] = tf.range(traj_len)
198
+
199
+ # extracts `language_key` into the "task" dict
200
+ task = {}
201
+ if language_key is not None:
202
+ if traj[language_key].dtype != tf.string:
203
+ raise ValueError(
204
+ f"Language key {language_key} has dtype {traj[language_key].dtype}, " "but it must be tf.string."
205
+ )
206
+ task["language_instruction"] = traj.pop(language_key)
207
+
208
+ traj = {
209
+ "observation": new_obs,
210
+ "task": task,
211
+ "action": tf.cast(traj["action"], tf.float32),
212
+ "dataset_name": tf.repeat(name, traj_len),
213
+ }
214
+
215
+ if absolute_action_mask is not None:
216
+ if len(absolute_action_mask) != traj["action"].shape[-1]:
217
+ raise ValueError(
218
+ f"Length of absolute_action_mask ({len(absolute_action_mask)}) "
219
+ f"does not match action dimension ({traj['action'].shape[-1]})."
220
+ )
221
+ traj["absolute_action_mask"] = tf.tile(
222
+ tf.convert_to_tensor(absolute_action_mask, dtype=tf.bool)[None],
223
+ [traj_len, 1],
224
+ )
225
+
226
+ return traj
227
+
228
+ builder = tfds.builder(name, data_dir=data_dir)
229
+
230
+ # load or compute dataset statistics
231
+ if isinstance(dataset_statistics, str):
232
+ with tf.io.gfile.GFile(dataset_statistics, "r") as f:
233
+ dataset_statistics = json.load(f)
234
+ elif dataset_statistics is None:
235
+ full_dataset = dl.DLataset.from_rlds(
236
+ builder, split="all", shuffle=False, num_parallel_reads=num_parallel_reads
237
+ ).traj_map(restructure, num_parallel_calls)
238
+ # tries to load from cache, otherwise computes on the fly
239
+ dataset_statistics = get_dataset_statistics(
240
+ full_dataset,
241
+ hash_dependencies=(
242
+ str(builder.info),
243
+ str(state_obs_keys),
244
+ inspect.getsource(standardize_fn) if standardize_fn is not None else "",
245
+ ),
246
+ save_dir=builder.data_dir,
247
+ )
248
+ dataset_statistics = tree_map(np.array, dataset_statistics)
249
+
250
+ # skip normalization for certain action dimensions
251
+ if action_normalization_mask is not None:
252
+ if len(action_normalization_mask) != dataset_statistics["action"]["mean"].shape[-1]:
253
+ raise ValueError(
254
+ f"Length of skip_normalization_mask ({len(action_normalization_mask)}) "
255
+ f"does not match action dimension ({dataset_statistics['action']['mean'].shape[-1]})."
256
+ )
257
+ dataset_statistics["action"]["mask"] = np.array(action_normalization_mask)
258
+
259
+ # construct the dataset
260
+ split = "train" if train else "val"
261
+
262
+ dataset = dl.DLataset.from_rlds(builder, split=split, shuffle=shuffle, num_parallel_reads=num_parallel_reads, shuffle_seed=shuffle_seed)
263
+
264
+ dataset = dataset.traj_map(restructure, num_parallel_calls)
265
+ dataset = dataset.traj_map(
266
+ partial(
267
+ normalize_action_and_proprio,
268
+ metadata=dataset_statistics,
269
+ normalization_type=action_proprio_normalization_type,
270
+ ),
271
+ num_parallel_calls,
272
+ )
273
+
274
+ return dataset, dataset_statistics
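+
+ # NOTE :: Illustrative usage sketch (kept as a comment, not executed). The dataset name, data directory, and
+ # observation keys below are placeholders / assumptions; real calls would also typically pass a matching
+ # `standardize_fn` from `prismatic.vla.datasets.rlds.oxe.transforms`.
+ #
+ # ds, stats = make_dataset_from_rlds(
+ #     "some_rlds_dataset",                 # hypothetical dataset name
+ #     "/path/to/rlds/data",                # hypothetical data directory
+ #     train=True,
+ #     shuffle_seed=42,
+ #     image_obs_keys={"primary": "image_0", "wrist": None},   # "wrist" becomes a padding image
+ #     state_obs_keys=["state"],
+ #     language_key="language_instruction",
+ #     action_proprio_normalization_type=ACTION_PROPRIO_NORMALIZATION_TYPE,
+ # )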
275
+
276
+
277
+ def apply_trajectory_transforms(
278
+ dataset: dl.DLataset,
279
+ *,
280
+ train: bool,
281
+ goal_relabeling_strategy: Optional[str] = None,
282
+ goal_relabeling_kwargs: dict = {},
283
+ window_size: int = 1,
284
+ future_action_window_size: int = 0,
285
+ subsample_length: Optional[int] = None,
286
+ skip_unlabeled: bool = False,
287
+ max_action: Optional[float] = None,
288
+ max_proprio: Optional[float] = None,
289
+ task_augment_strategy: Optional[str] = None,
290
+ task_augment_kwargs: dict = {},
291
+ num_parallel_calls: int = tf.data.AUTOTUNE,
292
+ use_predict_future_prop: bool = False,
293
+ ) -> dl.DLataset:
294
+ """
295
+ Applies common transforms that happen at a trajectory level. Such transforms are usually some sort of "relabeling"
296
+ (e.g., filtering, chunking, adding goals, dropping keys).
297
+
298
+ Transforms in this function should have the following properties:
299
+ - They require access to an entire trajectory (i.e., they cannot be applied frame-wise).
300
+ - They are generally not CPU-intensive, mostly involving moving and copying data.
301
+ - They do not require decoded images.
302
+
303
+ Args:
304
+ dataset (dl.DLataset): The dataset to transform.
305
+ train (bool): Whether the dataset is for training (affects subsampling).
306
+ goal_relabeling_strategy (str, optional): The goal relabeling strategy to use, or None for
307
+ no goal relabeling. See `goal_relabeling.py`.
308
+ goal_relabeling_kwargs (dict, optional): Additional keyword arguments to pass to the goal relabeling function.
309
+ window_size (int, optional): The length of the snippets that trajectories are chunked into.
310
+ future_action_window_size (int, optional): The number of future actions beyond window_size to include
311
+ in the chunked actions.
312
+ subsample_length (int, optional): If provided, trajectories longer than this will be subsampled to
313
+ this length (after goal relabeling and chunking).
314
+ skip_unlabeled (bool, optional): Whether to skip trajectories with no language labels.
315
+ max_action: (float, optional): If provided, trajectories in which *any* action dimension
316
+ of *any* transition has an absolute value larger than this will be skipped.
317
+ max_proprio: (float, optional): If provided, trajectories in which *any* proprio dimension
318
+ of *any* transition has an absolute value larger than this will be skipped.
319
+ task_augment_strategy (str, optional): The task augmentation strategy to use, or None for no task
320
+ augmentation. See `task_augmentation.py`.
321
+ task_augment_kwargs (dict, optional): Additional keyword arguments to pass to the task augmentation
322
+ function.
323
+ num_parallel_calls (int, optional): number of parallel calls for map operations. Default to AUTOTUNE.
+ use_predict_future_prop (bool, optional): If True, chunks future proprio observations alongside actions
+ (uses `chunk_act_future_obs` instead of `chunk_act_obs`).
324
+ """
325
+ if skip_unlabeled:
326
+ if "language_instruction" not in dataset.element_spec["task"]:
327
+ raise ValueError("skip_unlabeled=True but dataset does not have language labels.")
328
+
329
+ dataset = dataset.filter(lambda x: tf.math.reduce_any(x["task"]["language_instruction"] != ""))
330
+
331
+ if max_action is not None:
332
+ dataset = dataset.filter(lambda x: tf.math.reduce_all(tf.math.abs(x["action"]) <= max_action))
333
+
334
+ if max_proprio is not None and "proprio" in dataset.element_spec["observation"]:
335
+ dataset = dataset.filter(lambda x: tf.math.reduce_all(tf.math.abs(x["observation"]["proprio"]) <= max_proprio))
336
+
337
+ # Filter out trajectories that are too short for action chunking
338
+ # Required minimum length: window_size + future_action_window_size
339
+ # required_min_length = window_size + future_action_window_size
340
+ # if required_min_length > 1:
341
+ # overwatch.info(f"Filtering trajectories shorter than {required_min_length} steps for action chunking (window_size={window_size}, future_action_window_size={future_action_window_size})")
342
+
343
+ # # Quick statistics: sample a subset of data to estimate filtering ratio
344
+ # try:
345
+ # sample_size = 1000 # Number of samples
346
+ # before_sample = dataset.take(sample_size)
347
+
348
+ # # Count total and valid trajectories in the sample
349
+ # total_sampled = 0
350
+ # valid_sampled = 0
351
+
352
+ # for item in before_sample:
353
+ # total_sampled += 1
354
+ # traj_length = tf.shape(item["action"])[0].numpy()
355
+ # if traj_length >= required_min_length:
356
+ # valid_sampled += 1
357
+
358
+ # if total_sampled > 0:
359
+ # filter_ratio = valid_sampled / total_sampled
360
+ # filtered_ratio = (total_sampled - valid_sampled) / total_sampled
361
+ # overwatch.info(f"Sample statistics ({sample_size} trajectories): keep rate {filter_ratio:.2%}, filter rate {filtered_ratio:.2%}")
362
+ # overwatch.info(f"Estimated ~{filtered_ratio:.1%} of trajectories will be filtered due to insufficient length")
363
+ # else:
364
+ # overwatch.info("Unable to obtain sample data for statistics")
365
+
366
+ # except Exception as e:
367
+ # overwatch.warning(f"Error during quick statistics: {e}, continuing with filtering operation")
368
+
369
+ # Execute the actual filtering operation
370
+ # dataset = dataset.filter(lambda x: tf.shape(x["action"])[0] >= required_min_length)
371
+ # overwatch.info("Trajectory length filtering completed")
372
+ # marks which entries of the observation and task dicts are padding
373
+ dataset = dataset.traj_map(traj_transforms.add_pad_mask_dict, num_parallel_calls)
374
+
375
+ # updates the "task" dict
376
+ if goal_relabeling_strategy is not None:
377
+ dataset = dataset.traj_map(
378
+ partial(getattr(goal_relabeling, goal_relabeling_strategy), **goal_relabeling_kwargs),
379
+ num_parallel_calls,
380
+ )
381
+
382
+ # must run task augmentation before chunking, in case it changes goal timesteps
383
+ if train and task_augment_strategy is not None:
384
+ # perform task augmentation (e.g., dropping keys)
385
+ dataset = dataset.traj_map(
386
+ partial(
387
+ getattr(task_augmentation, task_augment_strategy),
388
+ **task_augment_kwargs,
389
+ ),
390
+ num_parallel_calls,
391
+ )
392
+
393
+ # chunks observations and actions, giving them a new axis at index 1 of size `window_size` and
394
+ # `window_size + future_action_window_size`, respectively
395
+ if use_predict_future_prop:
396
+ traj_transforms_strategy = traj_transforms.chunk_act_future_obs
397
+ else:
398
+ traj_transforms_strategy = traj_transforms.chunk_act_obs
399
+
400
+ dataset = dataset.traj_map(
401
+ partial(
402
+ traj_transforms_strategy,
403
+ window_size=window_size,
404
+ future_action_window_size=future_action_window_size,
405
+ ),
406
+ num_parallel_calls,
407
+ )
408
+
409
+ if train and subsample_length is not None:
410
+ dataset = dataset.traj_map(
411
+ partial(traj_transforms.subsample, subsample_length=subsample_length),
412
+ num_parallel_calls,
413
+ )
414
+
415
+ return dataset
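+
+ # NOTE :: Illustrative sketch (kept as a comment, not executed); `traj_ds` stands for a trajectory-level dataset
+ # returned by `make_dataset_from_rlds`. With window_size=1 and future_action_window_size=K-1, each resulting
+ # frame carries an action chunk of shape [1 + (K - 1), action_dim] = [K, action_dim].
+ #
+ # traj_ds = apply_trajectory_transforms(
+ #     traj_ds,
+ #     train=True,
+ #     window_size=1,
+ #     future_action_window_size=NUM_ACTIONS_CHUNK - 1,   # chunk NUM_ACTIONS_CHUNK actions per frame
+ #     skip_unlabeled=True,                                # assumes `language_instruction` exists in the task dict
+ # )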
416
+
417
+
418
+ def apply_per_dataset_frame_transforms(
419
+ dataset: dl.DLataset,
420
+ chunk_filter_fn: Optional[Callable] = None,
421
+ ):
422
+ """
423
+ Optionally applied *per-dataset* transforms that happen at a frame level.
424
+
425
+ Args:
426
+ chunk_filter_fn (callable, optional): Filter function for chunks.
427
+ """
428
+ if chunk_filter_fn:
429
+ dataset = dataset.filter(chunk_filter_fn)
430
+ return dataset
431
+
432
+
433
+ def apply_frame_transforms(
434
+ dataset: dl.DLataset,
435
+ *,
436
+ train: bool,
437
+ image_augment_kwargs: Union[Dict, Dict[str, Dict]] = {},
438
+ resize_size: Union[Tuple[int, int], Dict[str, Tuple[int, int]]] = {},
439
+ depth_resize_size: Union[Tuple[int, int], Dict[str, Tuple[int, int]]] = {},
440
+ num_parallel_calls: int = tf.data.AUTOTUNE,
441
+ ) -> dl.DLataset:
442
+ """
443
+ Applies common transforms that happen at a frame level. These transforms are usually more CPU-intensive, (e.g.,
444
+ decoding or resizing images).
445
+
446
+ Args:
447
+ train (bool): Whether the dataset is for training (affects image augmentation).
448
+ dataset (dl.DLataset): The dataset to transform.
449
+ image_augment_kwargs (dict|Mapping[str, dict]): Keyword arguments to pass to the image augmentation
450
+ function. See `dlimp.transforms.augment_image` for documentation of these kwargs. If a dict of
451
+ dicts is provided, then key "k" will be used for "image_{k}" (names determined by `image_obs_keys`
452
+ in `make_dataset_from_rlds`). Augmentation will be skipped for missing keys (so pass an empty dict
453
+ to skip augmentation for all images).
454
+ resize_size (Tuple[int, int]|Mapping[str, Tuple[int, int]]): If provided, images will be resized to
455
+ this size. If a dict of tuples is provided, then key "k" will be used for "image_{k}" (names
456
+ determined by `image_obs_keys` in `make_dataset_from_rlds`). Resizing will be skipped for missing
457
+ keys (so pass an empty dict to skip resizing for all images).
458
+ depth_resize_size (Tuple[int, int]|Mapping[str, Tuple[int, int]]): Same as resize_size, but for depth
459
+ images.
460
+ num_parallel_calls (int): number of parallel calls for frame_map operations. Default to AUTOTUNE.
461
+ """
462
+
463
+ # Convenience wrapper that takes a function that operates on a non-chunked "observation" dict and applies
464
+ # it to the chunked "observation" dict as well as the non-chunked "task" dict
465
+ def apply_obs_transform(fn: Callable[[Dict], Dict], frame: Dict) -> Dict:
466
+ frame["task"] = fn(frame["task"])
467
+ frame["observation"] = dl.vmap(fn)(frame["observation"])
468
+ return frame
469
+
470
+ # Decode + resize images (and depth images)
471
+ dataset = dataset.frame_map(
472
+ partial(
473
+ apply_obs_transform,
474
+ partial(obs_transforms.decode_and_resize, resize_size=resize_size, depth_resize_size=depth_resize_size),
475
+ ),
476
+ num_parallel_calls,
477
+ )
478
+
479
+ if train:
480
+ # Augment all images with the same seed, skipping padding images
481
+ def aug(frame: dict):
482
+ seed = tf.random.uniform([2], maxval=tf.dtypes.int32.max, dtype=tf.int32)
483
+ aug_fn = partial(obs_transforms.augment, seed=seed, augment_kwargs=image_augment_kwargs)
484
+ return apply_obs_transform(aug_fn, frame)
485
+
486
+ dataset = dataset.frame_map(aug, num_parallel_calls)
487
+
488
+ return dataset
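+
+ # NOTE :: Illustrative sketch (kept as a comment, not executed); the per-camera keys ("primary", "wrist") are
+ # assumptions and must match the `image_obs_keys` passed to `make_dataset_from_rlds`. See
+ # `dlimp.transforms.augment_image` for the supported augmentation kwargs.
+ #
+ # frame_ds = apply_frame_transforms(
+ #     frame_ds,
+ #     train=True,
+ #     resize_size={"primary": (224, 224), "wrist": (224, 224)},
+ #     image_augment_kwargs={"primary": {"random_brightness": [0.2], "augment_order": ["random_brightness"]}},
+ # )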
489
+
490
+
491
+ def make_single_dataset(
492
+ dataset_kwargs: dict,
493
+ *,
494
+ train: bool,
495
+ traj_transform_kwargs: dict = {},
496
+ frame_transform_kwargs: dict = {},
497
+ ) -> Tuple[dl.DLataset, int, dict]:
498
+ """Creates a single dataset from kwargs. Returns a dataset of trajectories.
499
+
500
+ Args:
501
+ dataset_kwargs: kwargs passed to `make_dataset_from_rlds` that are dataset-specific.
502
+ train: whether this is a training or validation dataset.
503
+ traj_transform_kwargs: kwargs passed to 'apply_trajectory_transforms'.
504
+ frame_transform_kwargs: kwargs passed to 'apply_frame_transforms'.
505
+ """
506
+ dataset, dataset_statistics = make_dataset_from_rlds(
507
+ **dataset_kwargs,
508
+ train=train,
509
+ )
510
+ dataset = apply_trajectory_transforms(dataset, **traj_transform_kwargs, train=train)
511
+ dataset = apply_frame_transforms(dataset, **frame_transform_kwargs, train=train)
512
+
513
+ # this seems to reduce memory usage without affecting speed
514
+ dataset = dataset.with_ram_budget(1)
515
+
516
+ # return the dataset along with its trajectory count and statistics for later use
517
+ return dataset, dataset_statistics["num_trajectories"], dataset_statistics
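+
+ # NOTE :: Illustrative sketch (kept as a comment, not executed); `single_dataset_kwargs` is a hypothetical dict
+ # carrying the dataset-specific arguments of `make_dataset_from_rlds` (name, data_dir, shuffle_seed,
+ # action_proprio_normalization_type, ...).
+ #
+ # ds, n_trajectories, stats = make_single_dataset(
+ #     single_dataset_kwargs,
+ #     train=True,
+ #     traj_transform_kwargs={"window_size": 1},
+ #     frame_transform_kwargs={"resize_size": (224, 224)},
+ # )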
518
+
519
+
520
+ # === Core Initializer ===
521
+ def make_interleaved_dataset(
522
+ dataset_kwargs_list: List[Dict],
523
+ sample_weights: Optional[List[float]] = None,
524
+ *,
525
+ train: bool,
526
+ shuffle_buffer_size: int,
527
+ shuffle_seed: int,
528
+ traj_transform_kwargs: Optional[Dict] = None,
529
+ frame_transform_kwargs: Optional[Dict] = None,
530
+ batch_size: Optional[int] = None,
531
+ balance_weights: bool = False,
532
+ traj_transform_threads: Optional[int] = None,
533
+ traj_read_threads: Optional[int] = None,
534
+ ) -> Tuple[dl.DLataset, int, dict]:
535
+ """
536
+ Creates an interleaved dataset from list of dataset configs (kwargs). Returns a dataset of batched frames.
537
+
538
+ Args:
539
+ dataset_kwargs_list: list of kwargs, each element of which is passed to `make_dataset_from_rlds`.
540
+ "num_parallel_calls" and "num_parallel_reads" are overridden using `traj_transform_threads` and
541
+ `traj_read_threads`, respectively.
542
+ sample_weights: sampling weights for each dataset in list. If None, defaults to uniform.
543
+ train: whether this is a training or validation dataset.
544
+ shuffle_buffer_size: size of the dataset shuffle buffer (in number of frames).
+ shuffle_seed: random seed used for dataset interleaving and frame-level shuffling, for reproducibility.
545
+ traj_transform_kwargs: kwargs passed to `apply_trajectory_transforms`. "num_parallel_calls" is
546
+ overridden using `traj_transform_threads`.
547
+ frame_transform_kwargs: kwargs passed to `apply_frame_transforms`.
548
+ batch_size: batch size, if not provided output is not batched.
549
+ balance_weights: if True, the sample weights are multiplied by the number of frames in each dataset.
550
+ This makes it so that, if all the sample weights are equal, one full iteration through the interleaved
551
+ dataset will correspond to one full iteration through each individual dataset (only in expectation,
552
+ since in practice the sampling is random).
553
+ traj_transform_threads: total number of parallel calls for trajectory transforms, distributed across
554
+ datasets according to their sampling weights. If None, defaults to AUTOTUNE for every dataset.
555
+ traj_read_threads: total number of parallel read workers for trajectory transforms, distributed across
556
+ datasets according to their sampling weights. If None, defaults to AUTOTUNE for every dataset.
557
+ """
558
+ # Default to uniform sampling (if `sample_weights` is not specified)
559
+
560
+ if not sample_weights:
561
+ sample_weights = [1.0] * len(dataset_kwargs_list)
562
+
563
+ if len(sample_weights) != len(dataset_kwargs_list):
564
+ raise ValueError(f"sample_weights must be None or have length {len(dataset_kwargs_list)}.")
565
+
566
+ # Check valid `traj_transform_kwargs` and `frame_transform_kwargs`
567
+ if (traj_transform_kwargs is None) or (frame_transform_kwargs is None):
568
+ raise ValueError("Missing `traj_transform_kwargs` and `frame_transform_kwargs`!")
569
+
570
+ # Get Dataset Sizes
571
+ dataset_sizes, all_dataset_statistics = [], {}
572
+ for dataset_kwargs in dataset_kwargs_list:
573
+ data_kwargs = copy.deepcopy(dataset_kwargs)
574
+ if "dataset_frame_transform_kwargs" in data_kwargs:
575
+ data_kwargs.pop("dataset_frame_transform_kwargs")
576
+ _, dataset_statistics = make_dataset_from_rlds(**data_kwargs, train=train, shuffle_seed=shuffle_seed)
577
+ dataset_sizes.append(dataset_statistics["num_transitions"])
578
+ all_dataset_statistics[dataset_kwargs["name"]] = dataset_statistics
579
+
580
+ # Get the indices of the "primary" datasets (i.e., datasets with sample_weight == 1.0)
581
+ primary_dataset_indices = np.array([idx for idx in range(len(sample_weights)) if sample_weights[idx] == 1.0])
582
+
583
+ # Balance and Normalize Weights
584
+ if balance_weights:
585
+ sample_weights = np.array(sample_weights) * np.array(dataset_sizes)
586
+ sample_weights = np.array(sample_weights) / np.sum(sample_weights)
587
+ pprint_data_mixture(dataset_kwargs_list, sample_weights)
588
+
589
+ # Effective Dataset Length = Number of samples until each dataset has completed at least one epoch
590
+ # =>> Note :: Only counting the "primary" datasets (i.e., datasets with sample_weight == 1.0)
591
+ dataset_len = int((np.array(dataset_sizes) / sample_weights)[primary_dataset_indices].max())
592
+
593
+ # Allocate Threads based on Weights
594
+ threads_per_dataset = allocate_threads(traj_transform_threads, sample_weights)
595
+ reads_per_dataset = allocate_threads(traj_read_threads, sample_weights)
596
+
597
+ overwatch.info("Threads per Dataset: %s", threads_per_dataset)
598
+ overwatch.info("Reads per Dataset: %s", reads_per_dataset)
599
+
600
+ # Construct Datasets
601
+ overwatch.info("Constructing datasets...")
602
+ datasets = []
603
+ for dataset_kwargs, threads, reads in zip(
604
+ dataset_kwargs_list,
605
+ threads_per_dataset,
606
+ reads_per_dataset,
607
+ ):
608
+ dataset_frame_transform_kwargs = (
609
+ dataset_kwargs.pop("dataset_frame_transform_kwargs")
610
+ if "dataset_frame_transform_kwargs" in dataset_kwargs
611
+ else {}
612
+ )
613
+ dataset, _ = make_dataset_from_rlds(
614
+ **dataset_kwargs,
615
+ train=train,
616
+ shuffle_seed=shuffle_seed,
617
+ num_parallel_calls=threads,
618
+ num_parallel_reads=reads,
619
+ dataset_statistics=all_dataset_statistics[dataset_kwargs["name"]],
620
+ )
621
+ dataset = apply_trajectory_transforms(
622
+ dataset.repeat(),
623
+ **traj_transform_kwargs,
624
+ num_parallel_calls=threads,
625
+ train=train,
626
+ ).flatten(num_parallel_calls=threads)
627
+ dataset = apply_per_dataset_frame_transforms(dataset, **dataset_frame_transform_kwargs)
628
+ datasets.append(dataset)
629
+
630
+ # Interleave at the Frame Level
631
+ dataset: dl.DLataset = dl.DLataset.sample_from_datasets(datasets, sample_weights, seed=shuffle_seed)
632
+
633
+ # Validation =>> fix a single shuffle buffer of data and cache it in RAM; prevents gradual memory increase!
634
+ if not train:
635
+ dataset = dataset.take(shuffle_buffer_size).cache()
636
+
637
+ # Shuffle the Dataset
638
+ # =>> IMPORTANT :: Shuffle AFTER .cache(), or else memory will still leak!
639
+ dataset = dataset.shuffle(shuffle_buffer_size, seed=shuffle_seed)
640
+
641
+ # Apply Frame Transforms
642
+ overwatch.info("Applying frame transforms on dataset...")
643
+ dataset = apply_frame_transforms(dataset, **frame_transform_kwargs, train=train)
644
+
645
+ # [Contract] When training VLA Policies, we let the Collator handle Batching!
646
+ if batch_size is not None:
647
+ dataset = dataset.batch(batch_size)
648
+
649
+ # Note =>> Seems to reduce memory usage without affecting speed?
650
+ dataset = dataset.with_ram_budget(1)
651
+
652
+ # Save for Later
653
+ dataset.sample_weights = sample_weights
654
+
655
+ return dataset, dataset_len, all_dataset_statistics
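+
+ # NOTE :: Illustrative sketch (kept as a comment, not executed); `dataset_a_kwargs` / `dataset_b_kwargs` are
+ # hypothetical dicts mirroring the arguments of `make_dataset_from_rlds` (each must include a "name" key). At
+ # least one sample weight should equal 1.0 so that the effective dataset length can be computed.
+ #
+ # dataset, dataset_len, all_stats = make_interleaved_dataset(
+ #     [dataset_a_kwargs, dataset_b_kwargs],
+ #     sample_weights=[1.0, 0.5],
+ #     train=True,
+ #     shuffle_buffer_size=100_000,
+ #     shuffle_seed=42,
+ #     traj_transform_kwargs={"window_size": 1},
+ #     frame_transform_kwargs={"resize_size": {"primary": (224, 224)}},
+ #     balance_weights=True,
+ # )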
prismatic/vla/datasets/rlds/oxe/transforms.py ADDED
@@ -0,0 +1,951 @@
1
+ """
2
+ transforms.py
3
+
4
+ Defines a registry of per-dataset standardization transforms for each dataset in Open-X Embodiment.
5
+
6
+ Transforms adopt the following structure:
7
+ Input: Dictionary of *batched* features (i.e., has leading time dimension)
8
+ Output: Dictionary `step` =>> {
9
+ "observation": {
10
+ <image_keys, depth_image_keys>
11
+ State (in chosen state representation)
12
+ },
13
+ "action": Action (in chosen action representation),
14
+ "language_instruction": str
15
+ }
16
+ """
17
+
18
+ from typing import Any, Dict
19
+
20
+ import tensorflow as tf
21
+
22
+ from prismatic.vla.datasets.rlds.oxe.utils.droid_utils import droid_baseact_transform, droid_finetuning_transform
23
+ from prismatic.vla.datasets.rlds.utils.data_utils import (
24
+ binarize_gripper_actions,
25
+ invert_gripper_actions,
26
+ rel2abs_gripper_actions,
27
+ relabel_bridge_actions,
28
+ )
29
+
30
+
31
+ def bridge_oxe_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
32
+ """
33
+ Applies to version of Bridge V2 in Open X-Embodiment mixture.
34
+
35
+ Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it!
36
+ """
37
+ for key in trajectory.keys():
38
+ if key == "traj_metadata":
39
+ continue
40
+ elif key in ["observation", "action"]:
41
+ for key2 in trajectory[key]:
42
+ trajectory[key][key2] = trajectory[key][key2][1:]
43
+ else:
44
+ trajectory[key] = trajectory[key][1:]
45
+
46
+ trajectory["action"] = tf.concat(
47
+ (
48
+ trajectory["action"]["world_vector"],
49
+ trajectory["action"]["rotation_delta"],
50
+ tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32),
51
+ ),
52
+ axis=-1,
53
+ )
54
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
55
+ trajectory = relabel_bridge_actions(trajectory)
56
+ trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
57
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
58
+ return trajectory
59
+
60
+
61
+ def bridge_orig_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
62
+ """
63
+ Applies to original version of Bridge V2 from the official project website.
64
+
65
+ Note =>> In original Bridge V2 dataset, the first timestep has an all-zero action, so we remove it!
66
+ """
67
+ for key in trajectory.keys():
68
+ if key == "traj_metadata":
69
+ continue
70
+ elif key == "observation":
71
+ for key2 in trajectory[key]:
72
+ trajectory[key][key2] = trajectory[key][key2][1:]
73
+ else:
74
+ trajectory[key] = trajectory[key][1:]
75
+
76
+ trajectory["action"] = tf.concat(
77
+ [
78
+ trajectory["action"][:, :6],
79
+ binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
80
+ ],
81
+ axis=1,
82
+ )
83
+ trajectory = relabel_bridge_actions(trajectory)
84
+ trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
85
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
86
+ return trajectory
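+
+ # NOTE :: Illustrative sketch (kept as a comment, not executed) of how a standardization transform like the one
+ # above is consumed: it is passed as `standardize_fn` to `make_dataset_from_rlds` (defined in
+ # `prismatic.vla.datasets.rlds.dataset`); the dataset name and data_dir below are placeholders.
+ #
+ # ds, stats = make_dataset_from_rlds(
+ #     "bridge_orig",                       # hypothetical dataset name
+ #     "/path/to/rlds/data",
+ #     train=True,
+ #     shuffle_seed=0,
+ #     standardize_fn=bridge_orig_dataset_transform,
+ #     image_obs_keys={"primary": "image_0"},
+ #     language_key="language_instruction",
+ #     action_proprio_normalization_type=ACTION_PROPRIO_NORMALIZATION_TYPE,
+ # )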
87
+
88
+
89
+ def ppgm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
90
+ trajectory["action"] = tf.concat(
91
+ [
92
+ trajectory["action"][:, :6],
93
+ binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
94
+ ],
95
+ axis=1,
96
+ )
97
+ trajectory["observation"]["EEF_state"] = trajectory["observation"]["cartesian_position"][:, :6]
98
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["gripper_position"][:, -1:]
99
+ return trajectory
100
+
101
+
102
+ def rt1_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
103
+ # make gripper action absolute action, +1 = open, 0 = close
104
+ gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
105
+ gripper_action = rel2abs_gripper_actions(gripper_action)
106
+
107
+ trajectory["action"] = tf.concat(
108
+ (
109
+ trajectory["action"]["world_vector"],
110
+ trajectory["action"]["rotation_delta"],
111
+ gripper_action[:, None],
112
+ ),
113
+ axis=-1,
114
+ )
115
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
116
+ return trajectory
117
+
118
+
119
+ def kuka_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
120
+ # make gripper action absolute action, +1 = open, 0 = close
121
+ gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
122
+ gripper_action = rel2abs_gripper_actions(gripper_action)
123
+
124
+ trajectory["action"] = tf.concat(
125
+ (
126
+ trajectory["action"]["world_vector"],
127
+ trajectory["action"]["rotation_delta"],
128
+ gripper_action[:, None],
129
+ ),
130
+ axis=-1,
131
+ )
132
+ # decode compressed state
133
+ eef_value = tf.io.decode_compressed(
134
+ trajectory["observation"]["clip_function_input/base_pose_tool_reached"],
135
+ compression_type="ZLIB",
136
+ )
137
+ eef_value = tf.io.decode_raw(eef_value, tf.float32)
138
+ trajectory["observation"]["clip_function_input/base_pose_tool_reached"] = tf.reshape(eef_value, (-1, 7))
139
+ gripper_value = tf.io.decode_compressed(trajectory["observation"]["gripper_closed"], compression_type="ZLIB")
140
+ gripper_value = tf.io.decode_raw(gripper_value, tf.float32)
141
+ trajectory["observation"]["gripper_closed"] = tf.reshape(gripper_value, (-1, 1))
142
+ # trajectory["language_instruction"] = tf.fill(
143
+ # tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
144
+ # ) # delete uninformative language instruction
145
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
146
+ return trajectory
147
+
148
+
149
+ def taco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
150
+ trajectory["observation"]["state_eef"] = trajectory["observation"]["robot_obs"][:, :6]
151
+ trajectory["observation"]["state_gripper"] = trajectory["observation"]["robot_obs"][:, 7:8]
152
+ trajectory["action"] = trajectory["action"]["rel_actions_world"]
153
+
154
+ # invert gripper action + clip, +1 = open, 0 = close
155
+ trajectory["action"] = tf.concat(
156
+ (
157
+ trajectory["action"][:, :6],
158
+ tf.clip_by_value(trajectory["action"][:, -1:], 0, 1),
159
+ ),
160
+ axis=-1,
161
+ )
162
+
163
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
164
+ return trajectory
165
+
166
+
167
+ def jaco_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
168
+ trajectory["observation"]["state_eef"] = trajectory["observation"]["end_effector_cartesian_pos"][:, :6]
169
+ trajectory["observation"]["state_gripper"] = trajectory["observation"]["end_effector_cartesian_pos"][:, -1:]
170
+
171
+ # make gripper action absolute action, +1 = open, 0 = close
172
+ gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
173
+ gripper_action = rel2abs_gripper_actions(gripper_action)
174
+
175
+ trajectory["action"] = tf.concat(
176
+ (
177
+ trajectory["action"]["world_vector"],
178
+ tf.zeros_like(trajectory["action"]["world_vector"]),
179
+ gripper_action[:, None],
180
+ ),
181
+ axis=-1,
182
+ )
183
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
184
+ return trajectory
185
+
186
+
187
+ def berkeley_cable_routing_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
188
+ trajectory["action"] = tf.concat(
189
+ (
190
+ trajectory["action"]["world_vector"],
191
+ trajectory["action"]["rotation_delta"],
192
+ tf.zeros_like(trajectory["action"]["world_vector"][:, :1]),
193
+ ),
194
+ axis=-1,
195
+ )
196
+ # trajectory["language_instruction"] = tf.fill(
197
+ # tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
198
+ # ) # delete uninformative language instruction
199
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
200
+ return trajectory
201
+
202
+
203
+ def roboturk_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
204
+ # invert absolute gripper action, +1 = open, 0 = close
205
+ gripper_action = invert_gripper_actions(tf.clip_by_value(trajectory["action"]["gripper_closedness_action"], 0, 1))
206
+
207
+ trajectory["action"] = tf.concat(
208
+ (
209
+ trajectory["action"]["world_vector"],
210
+ trajectory["action"]["rotation_delta"],
211
+ gripper_action,
212
+ ),
213
+ axis=-1,
214
+ )
215
+ # trajectory["language_instruction"] = tf.fill(
216
+ # tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
217
+ # ) # delete uninformative language instruction
218
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
219
+ return trajectory
220
+
221
+
222
+ def nyu_door_opening_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
223
+ # make gripper action absolute action, +1 = open, 0 = close
224
+ gripper_action = trajectory["action"]["gripper_closedness_action"][:, 0]
225
+ gripper_action = rel2abs_gripper_actions(gripper_action)
226
+
227
+ trajectory["action"] = tf.concat(
228
+ (
229
+ trajectory["action"]["world_vector"],
230
+ trajectory["action"]["rotation_delta"],
231
+ gripper_action[:, None],
232
+ ),
233
+ axis=-1,
234
+ )
235
+ # trajectory["language_instruction"] = tf.fill(
236
+ # tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
237
+ # ) # delete uninformative language instruction
238
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
239
+ return trajectory
240
+
241
+
242
+ def viola_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
243
+ # make gripper action, +1 = open, 0 = close
244
+ gripper_action = trajectory["action"]["gripper_closedness_action"][:, None]
245
+ gripper_action = tf.clip_by_value(gripper_action, 0, 1)
246
+ gripper_action = invert_gripper_actions(gripper_action)
247
+
248
+ trajectory["action"] = tf.concat(
249
+ (
250
+ trajectory["action"]["world_vector"],
251
+ trajectory["action"]["rotation_delta"],
252
+ gripper_action,
253
+ ),
254
+ axis=-1,
255
+ )
256
+ # trajectory["language_instruction"] = tf.fill(
257
+ # tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
258
+ # ) # delete uninformative language instruction
259
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
260
+ return trajectory
261
+
262
+
263
+ def berkeley_autolab_ur5_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
264
+ trajectory["observation"]["state"] = trajectory["observation"]["robot_state"][:, 6:14]
265
+ trajectory["observation"]["depth"] = trajectory["observation"].pop("image_with_depth")
266
+
267
+ # make gripper action absolute action, +1 = open, 0 = close
268
+ gripper_action = trajectory["action"]["gripper_closedness_action"]
269
+ gripper_action = rel2abs_gripper_actions(gripper_action)
270
+
271
+ trajectory["action"] = tf.concat(
272
+ (
273
+ trajectory["action"]["world_vector"],
274
+ trajectory["action"]["rotation_delta"],
275
+ gripper_action[:, None],
276
+ ),
277
+ axis=-1,
278
+ )
279
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
280
+ return trajectory
281
+
282
+
283
+ def toto_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
284
+ trajectory["action"] = tf.concat(
285
+ (
286
+ trajectory["action"]["world_vector"],
287
+ trajectory["action"]["rotation_delta"],
288
+ tf.cast(trajectory["action"]["open_gripper"][:, None], tf.float32),
289
+ ),
290
+ axis=-1,
291
+ )
292
+ # trajectory["language_instruction"] = tf.fill(
293
+ # tf.shape(trajectory["observation"]["natural_language_instruction"]), ""
294
+ # ) # delete uninformative language instruction
295
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
296
+ return trajectory
297
+
298
+
299
+ def language_table_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
300
+ # default to "open" gripper
301
+ trajectory["action"] = tf.concat(
302
+ (
303
+ trajectory["action"],
304
+ tf.zeros_like(trajectory["action"]),
305
+ tf.zeros_like(trajectory["action"]),
306
+ tf.ones_like(trajectory["action"][:, :1]),
307
+ ),
308
+ axis=-1,
309
+ )
310
+
311
+ # decode language instruction
312
+ instruction_bytes = trajectory["observation"]["instruction"]
313
+ instruction_encoded = tf.strings.unicode_encode(instruction_bytes, output_encoding="UTF-8")
314
+ # Remove trailing padding --> convert RaggedTensor to regular Tensor.
315
+ trajectory["language_instruction"] = tf.strings.split(instruction_encoded, "\x00")[:, :1].to_tensor()[:, 0]
316
+ return trajectory
317
+
318
+
319
+ def pusht_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
320
+ trajectory["action"] = tf.concat(
321
+ (
322
+ trajectory["action"]["world_vector"],
323
+ trajectory["action"]["rotation_delta"],
324
+ trajectory["action"]["gripper_closedness_action"][:, None],
325
+ ),
326
+ axis=-1,
327
+ )
328
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
329
+ return trajectory
330
+
331
+
332
+ def stanford_kuka_multimodal_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
333
+ trajectory["observation"]["depth_image"] = trajectory["observation"]["depth_image"][..., 0]
334
+ trajectory["action"] = tf.concat(
335
+ (
336
+ trajectory["action"][:, :3],
337
+ tf.zeros_like(trajectory["action"][:, :3]),
338
+ trajectory["action"][:, -1:],
339
+ ),
340
+ axis=-1,
341
+ )
342
+ return trajectory
343
+
344
+
345
+ def nyu_rot_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
346
+ trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][..., :6]
347
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., -1:]
348
+ trajectory["action"] = trajectory["action"][..., :7]
349
+ return trajectory
350
+
351
+
352
+ def stanford_hydra_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
353
+ # invert gripper action, +1 = open, 0 = close
354
+ trajectory["action"] = tf.concat(
355
+ (
356
+ trajectory["action"][:, :6],
357
+ invert_gripper_actions(trajectory["action"][:, -1:]),
358
+ ),
359
+ axis=-1,
360
+ )
361
+
362
+ trajectory["observation"]["eef_state"] = tf.concat(
363
+ (
364
+ trajectory["observation"]["state"][:, :3],
365
+ trajectory["observation"]["state"][:, 7:10],
366
+ ),
367
+ axis=-1,
368
+ )
369
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -3:-2]
370
+ # trajectory["language_instruction"] = tf.fill(
371
+ # tf.shape(trajectory["language_instruction"]), ""
372
+ # ) # delete uninformative language instruction
373
+ return trajectory
374
+
375
+
376
+ def austin_buds_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
377
+ # invert gripper action + clip, +1 = open, 0 = close
378
+ trajectory["action"] = tf.concat(
379
+ (
380
+ trajectory["action"][:, :6],
381
+ invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
382
+ ),
383
+ axis=-1,
384
+ )
385
+
386
+ trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8]
387
+ # trajectory["language_instruction"] = tf.fill(
388
+ # tf.shape(trajectory["language_instruction"]), ""
389
+ # ) # delete uninformative language instruction
390
+ return trajectory
391
+
392
+
393
+ def nyu_franka_play_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
394
+ trajectory["observation"]["depth"] = tf.cast(trajectory["observation"]["depth"][..., 0], tf.float32)
395
+ trajectory["observation"]["depth_additional_view"] = tf.cast(
396
+ trajectory["observation"]["depth_additional_view"][..., 0], tf.float32
397
+ )
398
+ trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, -6:]
399
+
400
+ # clip gripper action, +1 = open, 0 = close
401
+ trajectory["action"] = tf.concat(
402
+ (
403
+ trajectory["action"][:, -8:-2],
404
+ tf.clip_by_value(trajectory["action"][:, -2:-1], 0, 1),
405
+ ),
406
+ axis=-1,
407
+ )
408
+
409
+ # trajectory["language_instruction"] = tf.fill(
410
+ # tf.shape(trajectory["language_instruction"]), ""
411
+ # ) # delete uninformative language instruction
412
+ return trajectory
413
+
414
+
415
+ def maniskill_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
416
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][..., 7:8]
417
+ return trajectory
418
+
419
+
420
+ def furniture_bench_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
421
+ import tensorflow_graphics.geometry.transformation as tft
422
+
423
+ trajectory["observation"]["state"] = tf.concat(
424
+ (
425
+ trajectory["observation"]["state"][:, :7],
426
+ trajectory["observation"]["state"][:, -1:],
427
+ ),
428
+ axis=-1,
429
+ )
430
+
431
+ # invert gripper action + clip, +1 = open, 0 = close
432
+ trajectory["action"] = tf.concat(
433
+ (
434
+ trajectory["action"][:, :3],
435
+ tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
436
+ invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
437
+ ),
438
+ axis=-1,
439
+ )
440
+ return trajectory
441
+
442
+
443
+ def cmu_franka_exploration_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
444
+ trajectory["action"] = trajectory["action"][..., :-1]
445
+ return trajectory
446
+
447
+
448
+ def ucsd_kitchen_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
449
+ trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7]
450
+ trajectory["action"] = trajectory["action"][..., :-1]
451
+ return trajectory
452
+
453
+
454
+ def ucsd_pick_place_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
455
+ trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
456
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
457
+ trajectory["action"] = tf.concat(
458
+ (
459
+ trajectory["action"][:, :3],
460
+ tf.zeros_like(trajectory["action"][:, :3]),
461
+ trajectory["action"][:, -1:],
462
+ ),
463
+ axis=-1,
464
+ )
465
+ return trajectory
466
+
467
+
468
+ def austin_sailor_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
469
+ # invert gripper action + clip, +1 = open, 0 = close
470
+ trajectory["action"] = tf.concat(
471
+ (
472
+ trajectory["action"][:, :6],
473
+ invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
474
+ ),
475
+ axis=-1,
476
+ )
477
+
478
+ # trajectory["language_instruction"] = tf.fill(
479
+ # tf.shape(trajectory["language_instruction"]), ""
480
+ # ) # delete uninformative language instruction
481
+ return trajectory
482
+
483
+
484
+ def austin_sirius_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
485
+ # invert gripper action + clip, +1 = open, 0 = close
486
+ trajectory["action"] = tf.concat(
487
+ (
488
+ trajectory["action"][:, :6],
489
+ invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
490
+ ),
491
+ axis=-1,
492
+ )
493
+
494
+ # trajectory["language_instruction"] = tf.fill(
495
+ # tf.shape(trajectory["language_instruction"]), ""
496
+ # ) # delete uninformative language instruction
497
+ return trajectory
498
+
499
+
500
+ def bc_z_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
501
+ trajectory["action"] = tf.concat(
502
+ (
503
+ trajectory["action"]["future/xyz_residual"][:, :3],
504
+ trajectory["action"]["future/axis_angle_residual"][:, :3],
505
+ invert_gripper_actions(tf.cast(trajectory["action"]["future/target_close"][:, :1], tf.float32)),
506
+ ),
507
+ axis=-1,
508
+ )
509
+ trajectory["language_instruction"] = trajectory["observation"]["natural_language_instruction"]
510
+ return trajectory
511
+
512
+
513
+ def tokyo_pr2_opening_fridge_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
514
+ trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
515
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
516
+ trajectory["action"] = trajectory["action"][..., :-1]
517
+ return trajectory
518
+
519
+
520
+ def tokyo_pr2_tabletop_manipulation_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
521
+ trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
522
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
523
+ trajectory["action"] = trajectory["action"][..., :-1]
524
+ return trajectory
525
+
526
+
527
+ def utokyo_xarm_pick_place_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
528
+ return trajectory
529
+
530
+
531
+ def utokyo_xarm_bimanual_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
532
+ trajectory["action"] = trajectory["action"][..., -7:]
533
+ return trajectory
534
+
535
+
536
+ def robo_net_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
537
+ trajectory["observation"]["eef_state"] = tf.concat(
538
+ (
539
+ trajectory["observation"]["state"][:, :4],
540
+ tf.zeros_like(trajectory["observation"]["state"][:, :2]),
541
+ ),
542
+ axis=-1,
543
+ )
544
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
545
+ trajectory["action"] = tf.concat(
546
+ (
547
+ trajectory["action"][:, :4],
548
+ tf.zeros_like(trajectory["action"][:, :2]),
549
+ trajectory["action"][:, -1:],
550
+ ),
551
+ axis=-1,
552
+ )
553
+ return trajectory
554
+
555
+
556
+ def berkeley_mvp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
557
+ return trajectory
558
+
559
+
560
+ def berkeley_rpt_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
561
+ return trajectory
562
+
563
+
564
+ def kaist_nonprehensible_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
565
+ trajectory["observation"]["state"] = trajectory["observation"]["state"][:, -7:]
566
+ trajectory["action"] = tf.concat(
567
+ (
568
+ trajectory["action"][:, :6],
569
+ tf.zeros_like(trajectory["action"][:, :1]),
570
+ ),
571
+ axis=-1,
572
+ )
573
+ return trajectory
574
+
575
+
576
+ def stanford_mask_vit_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
577
+ trajectory["observation"]["eef_state"] = tf.concat(
578
+ (
579
+ trajectory["observation"]["end_effector_pose"][:, :4],
580
+ tf.zeros_like(trajectory["observation"]["end_effector_pose"][:, :2]),
581
+ ),
582
+ axis=-1,
583
+ )
584
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["end_effector_pose"][:, -1:]
585
+ trajectory["action"] = tf.concat(
586
+ (
587
+ trajectory["action"][:, :4],
588
+ tf.zeros_like(trajectory["action"][:, :2]),
589
+ trajectory["action"][:, -1:],
590
+ ),
591
+ axis=-1,
592
+ )
593
+ return trajectory
594
+
595
+
596
+ def tokyo_lsmo_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
597
+ trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
598
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
599
+ return trajectory
600
+
601
+
602
+ def dlr_sara_pour_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
603
+ return trajectory
604
+
605
+
606
+ def dlr_sara_grid_clamp_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
607
+ trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :6]
608
+ return trajectory
609
+
610
+
611
+ def dlr_edan_shared_control_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
612
+ # invert gripper action, +1 = open, 0 = close
613
+ trajectory["action"] = tf.concat(
614
+ (
615
+ trajectory["action"][:, :6],
616
+ invert_gripper_actions(trajectory["action"][:, -1:]),
617
+ ),
618
+ axis=-1,
619
+ )
620
+ return trajectory
621
+
622
+
623
+ def asu_table_top_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
624
+ trajectory["observation"]["eef_state"] = trajectory["ground_truth_states"]["EE"]
625
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
626
+ return trajectory
627
+
628
+
629
+ def robocook_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
630
+ trajectory["observation"]["eef_state"] = trajectory["observation"]["state"][:, :6]
631
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
632
+ return trajectory
633
+
634
+
635
+ def imperial_wristcam_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
636
+ trajectory["action"] = trajectory["action"][..., :-1]
637
+ return trajectory
638
+
639
+
640
+ def iamlab_pick_insert_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
641
+ import tensorflow_graphics.geometry.transformation as tft
642
+
643
+ trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :7]
644
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 7:8]
645
+ trajectory["action"] = tf.concat(
646
+ (
647
+ trajectory["action"][:, :3],
648
+ tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
649
+ trajectory["action"][:, 7:8],
650
+ ),
651
+ axis=-1,
652
+ )
653
+ return trajectory
654
+
655
+
656
+ def uiuc_d3field_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
657
+ trajectory["action"] = tf.concat(
658
+ (
659
+ trajectory["action"],
660
+ tf.zeros_like(trajectory["action"]),
661
+ tf.zeros_like(trajectory["action"][:, :1]),
662
+ ),
663
+ axis=-1,
664
+ )
665
+ return trajectory
666
+
667
+
668
+ def utaustin_mutex_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
669
+ trajectory["observation"]["state"] = trajectory["observation"]["state"][:, :8]
670
+
671
+ # invert gripper action + clip, +1 = open, 0 = close
672
+ trajectory["action"] = tf.concat(
673
+ (
674
+ trajectory["action"][:, :6],
675
+ invert_gripper_actions(tf.clip_by_value(trajectory["action"][:, -1:], 0, 1)),
676
+ ),
677
+ axis=-1,
678
+ )
679
+
680
+ # trajectory["language_instruction"] = tf.fill(
681
+ # tf.shape(trajectory["language_instruction"]), ""
682
+ # ) # delete uninformative language instruction
683
+ return trajectory
684
+
685
+
686
+ def berkeley_fanuc_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
687
+ trajectory["observation"]["joint_state"] = trajectory["observation"]["state"][:, :6]
688
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, 6:7]
689
+
690
+ # dataset does not store gripper actions, so use gripper state info, invert so +1 = open, 0 = close
691
+ trajectory["action"] = tf.concat(
692
+ (
693
+ trajectory["action"],
694
+ invert_gripper_actions(trajectory["observation"]["gripper_state"]),
695
+ ),
696
+ axis=-1,
697
+ )
698
+ return trajectory
699
+
700
+
701
+ def cmu_playing_with_food_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
702
+ import tensorflow_graphics.geometry.transformation as tft
703
+
704
+ trajectory["action"] = tf.concat(
705
+ (
706
+ trajectory["action"][:, :3],
707
+ tft.euler.from_quaternion(trajectory["action"][:, 3:7]),
708
+ trajectory["action"][:, -1:],
709
+ ),
710
+ axis=-1,
711
+ )
712
+ return trajectory
713
+
714
+
715
+ def playfusion_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
716
+ trajectory["action"] = tf.concat(
717
+ (
718
+ trajectory["action"][:, :3],
719
+ trajectory["action"][:, -4:],
720
+ ),
721
+ axis=-1,
722
+ )
723
+ return trajectory
724
+
725
+
726
+ def cmu_stretch_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
727
+ trajectory["observation"]["eef_state"] = tf.concat(
728
+ (
729
+ trajectory["observation"]["state"][:, :3],
730
+ tf.zeros_like(trajectory["observation"]["state"][:, :3]),
731
+ ),
732
+ axis=-1,
733
+ )
734
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -1:]
735
+ trajectory["action"] = trajectory["action"][..., :-1]
736
+ return trajectory
737
+
738
+
739
+ def gnm_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
740
+ trajectory["observation"]["state"] = tf.concat(
741
+ (
742
+ trajectory["observation"]["position"],
743
+ tf.zeros_like(trajectory["observation"]["state"][:, :3]),
744
+ trajectory["observation"]["yaw"],
745
+ ),
746
+ axis=-1,
747
+ )
748
+ trajectory["action"] = tf.concat(
749
+ (
750
+ trajectory["action"],
751
+ tf.zeros_like(trajectory["action"]),
752
+ tf.zeros_like(trajectory["action"]),
753
+ tf.zeros_like(trajectory["action"][:, :1]),
754
+ ),
755
+ axis=-1,
756
+ )
757
+ return trajectory
758
+
759
+
760
+ def fmb_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
761
+ # every input feature is batched, i.e., has a leading batch dimension
762
+ trajectory["observation"]["proprio"] = tf.concat(
763
+ (
764
+ trajectory["observation"]["eef_pose"],
765
+ trajectory["observation"]["state_gripper_pose"][..., None],
766
+ ),
767
+ axis=-1,
768
+ )
769
+ return trajectory
770
+
771
+
772
+ def dobbe_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
773
+ # every input feature is batched, i.e., has a leading batch dimension
774
+ trajectory["observation"]["proprio"] = trajectory["observation"]["state"]
775
+ return trajectory
776
+
777
+
778
+ def roboset_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
779
+ # every input feature is batched, i.e., has a leading batch dimension
780
+ trajectory["observation"]["proprio"] = trajectory["observation"]["state"]
781
+
782
+ # gripper action is in -1...1 --> clip to 0...1, flip
783
+ gripper_action = trajectory["action"][:, -1:]
784
+ gripper_action = invert_gripper_actions(tf.clip_by_value(gripper_action, 0, 1))
785
+
786
+ trajectory["action"] = tf.concat(
787
+ (
788
+ trajectory["action"][:, :7],
789
+ gripper_action,
790
+ ),
791
+ axis=-1,
792
+ )
793
+ return trajectory
794
+
795
+
796
+ def rh20t_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
797
+ trajectory["action"] = tf.concat(
798
+ (
799
+ trajectory["action"]["tcp_base"],
800
+ tf.cast(trajectory["action"]["gripper"][:, None], tf.float32),
801
+ ),
802
+ axis=-1,
803
+ )
804
+ trajectory["observation"]["proprio"] = tf.concat(
805
+ (
806
+ trajectory["observation"]["tcp_base"],
807
+ trajectory["observation"]["gripper_width"][..., None],
808
+ ),
809
+ axis=-1,
810
+ )
811
+ return trajectory
812
+
813
+
814
+ def tdroid_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
815
+ trajectory["action"] = tf.concat(
816
+ [
817
+ trajectory["action"][:, :6],
818
+ binarize_gripper_actions(trajectory["action"][:, -1])[:, None],
819
+ ],
820
+ axis=1,
821
+ )
822
+ trajectory["observation"]["EEF_state"] = trajectory["observation"]["cartesian_position"][:, :6]
823
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["gripper_position"][:, -1:]
824
+ return trajectory
825
+
826
+
827
+ def libero_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
828
+ # gripper action is in -1 (open)...1 (close) --> clip to 0...1, flip --> +1 = open, 0 = close
829
+ gripper_action = trajectory["action"][:, -1:]
830
+ gripper_action = invert_gripper_actions(tf.clip_by_value(gripper_action, 0, 1))
831
+
832
+ trajectory["action"] = tf.concat(
833
+ [
834
+ trajectory["action"][:, :6],
835
+ gripper_action,
836
+ ],
837
+ axis=1,
838
+ )
839
+ trajectory["observation"]["EEF_state"] = trajectory["observation"]["state"][:, :6]
840
+ trajectory["observation"]["gripper_state"] = trajectory["observation"]["state"][:, -2:] # 2D gripper state
841
+ return trajectory
842
+
843
+
844
+ def aloha_dataset_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
845
+ # Don't need to do anything because dataset is already in the correct format
846
+ return trajectory
847
+
848
+
849
+ # === Registry ===
850
+ OXE_STANDARDIZATION_TRANSFORMS = {
851
+ "bridge_oxe": bridge_oxe_dataset_transform,
852
+ "bridge_orig": bridge_orig_dataset_transform,
853
+ "bridge_dataset": bridge_orig_dataset_transform,
854
+ "ppgm": ppgm_dataset_transform,
855
+ "ppgm_static": ppgm_dataset_transform,
856
+ "ppgm_wrist": ppgm_dataset_transform,
857
+ "fractal20220817_data": rt1_dataset_transform,
858
+ "kuka": kuka_dataset_transform,
859
+ "taco_play": taco_play_dataset_transform,
860
+ "jaco_play": jaco_play_dataset_transform,
861
+ "berkeley_cable_routing": berkeley_cable_routing_dataset_transform,
862
+ "roboturk": roboturk_dataset_transform,
863
+ "nyu_door_opening_surprising_effectiveness": nyu_door_opening_dataset_transform,
864
+ "viola": viola_dataset_transform,
865
+ "berkeley_autolab_ur5": berkeley_autolab_ur5_dataset_transform,
866
+ "toto": toto_dataset_transform,
867
+ "language_table": language_table_dataset_transform,
868
+ "columbia_cairlab_pusht_real": pusht_dataset_transform,
869
+ "stanford_kuka_multimodal_dataset_converted_externally_to_rlds": stanford_kuka_multimodal_dataset_transform,
870
+ "nyu_rot_dataset_converted_externally_to_rlds": nyu_rot_dataset_transform,
871
+ "stanford_hydra_dataset_converted_externally_to_rlds": stanford_hydra_dataset_transform,
872
+ "austin_buds_dataset_converted_externally_to_rlds": austin_buds_dataset_transform,
873
+ "nyu_franka_play_dataset_converted_externally_to_rlds": nyu_franka_play_dataset_transform,
874
+ "maniskill_dataset_converted_externally_to_rlds": maniskill_dataset_transform,
875
+ "furniture_bench_dataset_converted_externally_to_rlds": furniture_bench_dataset_transform,
876
+ "cmu_franka_exploration_dataset_converted_externally_to_rlds": cmu_franka_exploration_dataset_transform,
877
+ "ucsd_kitchen_dataset_converted_externally_to_rlds": ucsd_kitchen_dataset_transform,
878
+ "ucsd_pick_and_place_dataset_converted_externally_to_rlds": ucsd_pick_place_dataset_transform,
879
+ "austin_sailor_dataset_converted_externally_to_rlds": austin_sailor_dataset_transform,
880
+ "austin_sirius_dataset_converted_externally_to_rlds": austin_sirius_dataset_transform,
881
+ "bc_z": bc_z_dataset_transform,
882
+ "utokyo_pr2_opening_fridge_converted_externally_to_rlds": tokyo_pr2_opening_fridge_dataset_transform,
883
+ "utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": tokyo_pr2_tabletop_manipulation_dataset_transform,
884
+ "utokyo_xarm_pick_and_place_converted_externally_to_rlds": utokyo_xarm_pick_place_dataset_transform,
885
+ "utokyo_xarm_bimanual_converted_externally_to_rlds": utokyo_xarm_bimanual_dataset_transform,
886
+ "robo_net": robo_net_dataset_transform,
887
+ "berkeley_mvp_converted_externally_to_rlds": berkeley_mvp_dataset_transform,
888
+ "berkeley_rpt_converted_externally_to_rlds": berkeley_rpt_dataset_transform,
889
+ "kaist_nonprehensile_converted_externally_to_rlds": kaist_nonprehensible_dataset_transform,
890
+ "stanford_mask_vit_converted_externally_to_rlds": stanford_mask_vit_dataset_transform,
891
+ "tokyo_u_lsmo_converted_externally_to_rlds": tokyo_lsmo_dataset_transform,
892
+ "dlr_sara_pour_converted_externally_to_rlds": dlr_sara_pour_dataset_transform,
893
+ "dlr_sara_grid_clamp_converted_externally_to_rlds": dlr_sara_grid_clamp_dataset_transform,
894
+ "dlr_edan_shared_control_converted_externally_to_rlds": dlr_edan_shared_control_dataset_transform,
895
+ "asu_table_top_converted_externally_to_rlds": asu_table_top_dataset_transform,
896
+ "stanford_robocook_converted_externally_to_rlds": robocook_dataset_transform,
897
+ "imperialcollege_sawyer_wrist_cam": imperial_wristcam_dataset_transform,
898
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds": iamlab_pick_insert_dataset_transform,
899
+ "uiuc_d3field": uiuc_d3field_dataset_transform,
900
+ "utaustin_mutex": utaustin_mutex_dataset_transform,
901
+ "berkeley_fanuc_manipulation": berkeley_fanuc_dataset_transform,
902
+ "cmu_playing_with_food": cmu_playing_with_food_dataset_transform,
903
+ "cmu_play_fusion": playfusion_dataset_transform,
904
+ "cmu_stretch": cmu_stretch_dataset_transform,
905
+ "berkeley_gnm_recon": gnm_dataset_transform,
906
+ "berkeley_gnm_cory_hall": gnm_dataset_transform,
907
+ "berkeley_gnm_sac_son": gnm_dataset_transform,
908
+ "droid": droid_baseact_transform,
909
+ "fmb_dataset": fmb_dataset_transform,
910
+ "dobbe": dobbe_dataset_transform,
911
+ "roboset": roboset_dataset_transform,
912
+ "rh20t": rh20t_dataset_transform,
913
+ ### T-DROID datasets
914
+ "tdroid_carrot_in_bowl": tdroid_dataset_transform,
915
+ "tdroid_pour_corn_in_pot": tdroid_dataset_transform,
916
+ "tdroid_flip_pot_upright": tdroid_dataset_transform,
917
+ "tdroid_move_object_onto_plate": tdroid_dataset_transform,
918
+ "tdroid_knock_object_over": tdroid_dataset_transform,
919
+ "tdroid_cover_object_with_towel": tdroid_dataset_transform,
920
+ ### DROID Finetuning datasets
921
+ "droid_wipe": droid_finetuning_transform,
922
+ ### LIBERO datasets (modified versions)
923
+ "libero_spatial_no_noops": libero_dataset_transform,
924
+ "libero_object_no_noops": libero_dataset_transform,
925
+ "libero_goal_no_noops": libero_dataset_transform,
926
+ "libero_10_no_noops": libero_dataset_transform,
927
+ "libero_4_task_suites_no_noops": libero_dataset_transform,
928
+ ### ALOHA fine-tuning datasets
929
+ "aloha1_fold_shorts_20_demos": aloha_dataset_transform,
930
+ "aloha1_fold_shirt_30_demos": aloha_dataset_transform,
931
+ "aloha1_scoop_X_into_bowl_45_demos": aloha_dataset_transform,
932
+ "aloha1_put_X_into_pot_300_demos": aloha_dataset_transform,
933
+
934
+ "aloha_dual_bottles_pick_hard_d435_20": aloha_dataset_transform,
935
+
936
+ ### RoboTwin 2.0 datasets
937
+ "grab_roller_aloha_agilex_50": aloha_dataset_transform,
938
+ "handover_mic_aloha_agilex_50": aloha_dataset_transform,
939
+ "lift_pot_aloha_agilex_50": aloha_dataset_transform,
940
+ "move_can_pot_aloha_agilex_50": aloha_dataset_transform,
941
+ "open_laptop_aloha_agilex_50": aloha_dataset_transform,
942
+ "pick_dual_bottles_aloha_agilex_50": aloha_dataset_transform,
943
+ "place_dual_shoes_aloha_agilex_50": aloha_dataset_transform,
944
+ "place_object_basket_aloha_agilex_50": aloha_dataset_transform,
945
+ "place_phone_stand_aloha_agilex_50": aloha_dataset_transform,
946
+ "put_bottles_dustbin_aloha_agilex_50": aloha_dataset_transform,
947
+ "put_object_cabinet_aloha_agilex_50": aloha_dataset_transform,
948
+ "stack_blocks_two_aloha_agilex_50": aloha_dataset_transform,
949
+ "stack_bowls_two_aloha_agilex_50": aloha_dataset_transform,
950
+
951
+ }
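The registry above is the single lookup point that maps each RLDS dataset name to its standardization transform. A minimal sketch of how an entry might be fetched and applied to a dummy trajectory (assumes TensorFlow is installed and this repository is on PYTHONPATH; the dataset key and tensor shapes below are invented for illustration):

import tensorflow as tf

from prismatic.vla.datasets.rlds.oxe.transforms import OXE_STANDARDIZATION_TRANSFORMS

# Dummy LIBERO-style trajectory: 10 timesteps, 8-dim state, 7-dim action (6-DoF delta pose + gripper).
T = 10
dummy_traj = {
    "observation": {"state": tf.zeros([T, 8], tf.float32)},
    "action": tf.random.uniform([T, 7], minval=-1.0, maxval=1.0),
}

transform = OXE_STANDARDIZATION_TRANSFORMS["libero_goal_no_noops"]
std_traj = transform(dummy_traj)

print(std_traj["action"].shape)                         # (10, 7), gripper flipped so +1 = open, 0 = close
print(std_traj["observation"]["EEF_state"].shape)       # (10, 6)
print(std_traj["observation"]["gripper_state"].shape)   # (10, 2)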
prismatic/vla/datasets/rlds/oxe/utils/droid_utils.py ADDED
@@ -0,0 +1,178 @@
1
+ """Episode transforms for DROID dataset."""
2
+
3
+ from typing import Any, Dict
4
+
5
+ import tensorflow as tf
6
+ import tensorflow_graphics.geometry.transformation as tfg
7
+
8
+
9
+ def rmat_to_euler(rot_mat):
10
+ return tfg.euler.from_rotation_matrix(rot_mat)
11
+
12
+
13
+ def euler_to_rmat(euler):
14
+ return tfg.rotation_matrix_3d.from_euler(euler)
15
+
16
+
17
+ def invert_rmat(rot_mat):
18
+ return tfg.rotation_matrix_3d.inverse(rot_mat)
19
+
20
+
21
+ def rotmat_to_rot6d(mat):
22
+ """
23
+ Converts rotation matrix to R6 rotation representation (first two rows in rotation matrix).
24
+ Args:
25
+ mat: rotation matrix
26
+
27
+ Returns: 6d vector (first two rows of rotation matrix)
28
+
29
+ """
30
+ r6 = mat[..., :2, :]
31
+ r6_0, r6_1 = r6[..., 0, :], r6[..., 1, :]
32
+ r6_flat = tf.concat([r6_0, r6_1], axis=-1)
33
+ return r6_flat
34
+
35
+
36
+ def velocity_act_to_wrist_frame(velocity, wrist_in_robot_frame):
37
+ """
38
+ Translates velocity actions (translation + rotation) from base frame of the robot to wrist frame.
39
+ Args:
40
+ velocity: 6d velocity action (3 x translation, 3 x rotation)
41
+ wrist_in_robot_frame: 6d pose of the end-effector in robot base frame
42
+
43
+ Returns: 9d velocity action in robot wrist frame (3 x translation, 6 x rotation as R6)
44
+
45
+ """
46
+ R_frame = euler_to_rmat(wrist_in_robot_frame[:, 3:6])
47
+ R_frame_inv = invert_rmat(R_frame)
48
+
49
+ # world to wrist: dT_pi = R^-1 dT_rbt
50
+ vel_t = (R_frame_inv @ velocity[:, :3][..., None])[..., 0]
51
+
52
+ # world to wrist: dR_pi = R^-1 dR_rbt R
53
+ dR = euler_to_rmat(velocity[:, 3:6])
54
+ dR = R_frame_inv @ (dR @ R_frame)
55
+ dR_r6 = rotmat_to_rot6d(dR)
56
+ return tf.concat([vel_t, dR_r6], axis=-1)
57
+
58
+
59
+ def rand_swap_exterior_images(img1, img2):
60
+ """
61
+ Randomly swaps the two exterior images (for training with single exterior input).
62
+ """
63
+ return tf.cond(tf.random.uniform(shape=[]) > 0.5, lambda: (img1, img2), lambda: (img2, img1))
64
+
65
+
66
+ def droid_baseact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
67
+ """
68
+ DROID dataset transformation for actions expressed in *base* frame of the robot.
69
+ """
70
+ dt = trajectory["action_dict"]["cartesian_velocity"][:, :3]
71
+ dR = trajectory["action_dict"]["cartesian_velocity"][:, 3:6]
72
+
73
+ trajectory["action"] = tf.concat(
74
+ (
75
+ dt,
76
+ dR,
77
+ 1 - trajectory["action_dict"]["gripper_position"],
78
+ ),
79
+ axis=-1,
80
+ )
81
+ trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = (
82
+ rand_swap_exterior_images(
83
+ trajectory["observation"]["exterior_image_1_left"],
84
+ trajectory["observation"]["exterior_image_2_left"],
85
+ )
86
+ )
87
+ trajectory["observation"]["proprio"] = tf.concat(
88
+ (
89
+ trajectory["observation"]["cartesian_position"],
90
+ trajectory["observation"]["gripper_position"],
91
+ ),
92
+ axis=-1,
93
+ )
94
+ return trajectory
95
+
96
+
97
+ def droid_wristact_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
98
+ """
99
+ DROID dataset transformation for actions expressed in *wrist* frame of the robot.
100
+ """
101
+ wrist_act = velocity_act_to_wrist_frame(
102
+ trajectory["action_dict"]["cartesian_velocity"], trajectory["observation"]["cartesian_position"]
103
+ )
104
+ trajectory["action"] = tf.concat(
105
+ (
106
+ wrist_act,
107
+ trajectory["action_dict"]["gripper_position"],
108
+ ),
109
+ axis=-1,
110
+ )
111
+ trajectory["observation"]["exterior_image_1_left"], trajectory["observation"]["exterior_image_2_left"] = (
112
+ rand_swap_exterior_images(
113
+ trajectory["observation"]["exterior_image_1_left"],
114
+ trajectory["observation"]["exterior_image_2_left"],
115
+ )
116
+ )
117
+ trajectory["observation"]["proprio"] = tf.concat(
118
+ (
119
+ trajectory["observation"]["cartesian_position"],
120
+ trajectory["observation"]["gripper_position"],
121
+ ),
122
+ axis=-1,
123
+ )
124
+ return trajectory
125
+
126
+
127
+ def droid_finetuning_transform(trajectory: Dict[str, Any]) -> Dict[str, Any]:
128
+ """
129
+ DROID dataset transformation for actions expressed in *base* frame of the robot.
130
+ """
131
+ dt = trajectory["action_dict"]["cartesian_velocity"][:, :3]
132
+ dR = trajectory["action_dict"]["cartesian_velocity"][:, 3:6]
133
+ trajectory["action"] = tf.concat(
134
+ (
135
+ dt,
136
+ dR,
137
+ 1 - trajectory["action_dict"]["gripper_position"],
138
+ ),
139
+ axis=-1,
140
+ )
141
+ trajectory["observation"]["proprio"] = tf.concat(
142
+ (
143
+ trajectory["observation"]["cartesian_position"],
144
+ trajectory["observation"]["gripper_position"],
145
+ ),
146
+ axis=-1,
147
+ )
148
+ return trajectory
149
+
150
+
151
+ def zero_action_filter(traj: Dict) -> bool:
152
+ """
153
+ Filters out transitions whose actions are all zero (checking only the relative action dims; the gripper action is ignored).
154
+ Note: this filter is applied *after* action normalization, so we need to compare against the "normalized zero" action.
155
+ """
156
+ DROID_Q01 = tf.convert_to_tensor(
157
+ [
158
+ -0.7776297926902771,
159
+ -0.5803514122962952,
160
+ -0.5795090794563293,
161
+ -0.6464047729969025,
162
+ -0.7041108310222626,
163
+ -0.8895104378461838,
164
+ ]
165
+ )
166
+ DROID_Q99 = tf.convert_to_tensor(
167
+ [
168
+ 0.7597932070493698,
169
+ 0.5726242214441299,
170
+ 0.7351000607013702,
171
+ 0.6705610305070877,
172
+ 0.6464948207139969,
173
+ 0.8897542208433151,
174
+ ]
175
+ )
176
+ DROID_NORM_0_ACT = 2 * (tf.zeros_like(traj["action"][:, :6]) - DROID_Q01) / (DROID_Q99 - DROID_Q01 + 1e-8) - 1
177
+
178
+ return tf.reduce_any(tf.math.abs(traj["action"][:, :6] - DROID_NORM_0_ACT) > 1e-5)
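As a sanity check on the arithmetic in `zero_action_filter`: the filter recomputes what an all-zero raw action looks like after 1st/99th-percentile normalization and keeps a trajectory only if some action deviates from that value. A small NumPy stand-in (the quantiles here are shortened example values, not the exact DROID statistics hard-coded above):

import numpy as np

# Example per-dimension 1st/99th percentiles (illustrative only).
q01 = np.array([-0.78, -0.58, -0.58, -0.65, -0.70, -0.89])
q99 = np.array([0.76, 0.57, 0.74, 0.67, 0.65, 0.89])

# Normalization used by the data pipeline: a_norm = 2 * (a - q01) / (q99 - q01 + eps) - 1
def normalize(a):
    return 2 * (a - q01) / (q99 - q01 + 1e-8) - 1

norm_zero = normalize(np.zeros(6))                 # what a raw all-zero action maps to
normalized_actions = normalize(np.zeros((5, 6)))   # a trajectory of five all-zero actions

keep = np.any(np.abs(normalized_actions - norm_zero) > 1e-5)
print(keep)  # False -> this trajectory would be dropped by the filter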
prismatic/vla/datasets/rlds/utils/__init__.py ADDED
File without changes
prismatic/vla/datasets/rlds/utils/goal_relabeling.py ADDED
@@ -0,0 +1,32 @@
1
+ """
2
+ goal_relabeling.py
3
+
4
+ Contains simple goal relabeling logic for BC use-cases where rewards and next_observations are not required.
5
+ Each function should add entries to the "task" dict.
6
+ """
7
+
8
+ from typing import Dict
9
+
10
+ import tensorflow as tf
11
+
12
+ from prismatic.vla.datasets.rlds.utils.data_utils import tree_merge
13
+
14
+
15
+ def uniform(traj: Dict) -> Dict:
16
+ """Relabels with a true uniform distribution over future states."""
17
+ traj_len = tf.shape(tf.nest.flatten(traj["observation"])[0])[0]
18
+
19
+ # Select a random future index for each transition i in the range [i + 1, traj_len)
20
+ rand = tf.random.uniform([traj_len])
21
+ low = tf.cast(tf.range(traj_len) + 1, tf.float32)
22
+ high = tf.cast(traj_len, tf.float32)
23
+ goal_idxs = tf.cast(rand * (high - low) + low, tf.int32)
24
+
25
+ # Sometimes floating-point errors push an index out of bounds, so clamp to the last valid index
26
+ goal_idxs = tf.minimum(goal_idxs, traj_len - 1)
27
+
28
+ # Adds keys to "task" mirroring "observation" keys (`tree_merge` to combine "pad_mask_dict" properly)
29
+ goal = tf.nest.map_structure(lambda x: tf.gather(x, goal_idxs), traj["observation"])
30
+ traj["task"] = tree_merge(traj["task"], goal)
31
+
32
+ return traj
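The index arithmetic in `uniform` draws, for each timestep i, a goal index uniformly from the future range [i + 1, traj_len), then clamps it to the last index to guard against floating-point overshoot. A NumPy stand-in for that computation (illustrative only; the implementation above stays in TensorFlow so it can run inside the tf.data pipeline):

import numpy as np

traj_len = 6
rand = np.random.uniform(size=traj_len)             # one uniform draw per timestep
low = np.arange(traj_len, dtype=np.float64) + 1.0   # earliest allowed goal index: i + 1
high = float(traj_len)                              # exclusive upper bound

goal_idxs = (rand * (high - low) + low).astype(np.int64)
goal_idxs = np.minimum(goal_idxs, traj_len - 1)     # clamp; the final step becomes its own goal

print(goal_idxs)  # e.g. [3 2 4 5 5 5]: every entry except the last is strictly in the future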
prismatic/vla/materialize.py ADDED
@@ -0,0 +1,56 @@
1
+ """
2
+ materialize.py
3
+
4
+ Factory for initializing Open-X RLDS-backed datasets, given specified data mixture parameters; provides and
5
+ exports individual functions for clear control flow.
6
+ """
7
+
8
+ from pathlib import Path
9
+ from typing import Tuple, Type
10
+
11
+ from torch.utils.data import Dataset
12
+ from transformers import PreTrainedTokenizerBase
13
+
14
+ from prismatic.models.backbones.llm.prompting import PromptBuilder
15
+ from prismatic.models.backbones.vision import ImageTransform
16
+ from prismatic.util.data_utils import PaddedCollatorForActionPrediction
17
+ from prismatic.vla.action_tokenizer import ActionTokenizer
18
+ from prismatic.vla.datasets import EpisodicRLDSDataset, RLDSBatchTransform, RLDSDataset
19
+
20
+
21
+ def get_vla_dataset_and_collator(
22
+ data_root_dir: Path,
23
+ data_mix: str,
24
+ image_transform: ImageTransform,
25
+ tokenizer: PreTrainedTokenizerBase,
26
+ prompt_builder_fn: Type[PromptBuilder],
27
+ default_image_resolution: Tuple[int, int, int],
28
+ padding_side: str = "right",
29
+ predict_stop_token: bool = True,
30
+ shuffle_buffer_size: int = 100_000,
31
+ train: bool = True,
32
+ episodic: bool = False,
33
+ image_aug: bool = False,
34
+ ) -> Tuple[Dataset, ActionTokenizer, PaddedCollatorForActionPrediction]:
35
+ """Initialize RLDS Dataset (wraps TFDS), ActionTokenizer, and initialize transform/collation functions."""
36
+ action_tokenizer = ActionTokenizer(tokenizer)
37
+ batch_transform = RLDSBatchTransform(
38
+ action_tokenizer, tokenizer, image_transform, prompt_builder_fn, predict_stop_token=predict_stop_token
39
+ )
40
+ collator = PaddedCollatorForActionPrediction(
41
+ tokenizer.model_max_length, tokenizer.pad_token_id, padding_side=padding_side
42
+ )
43
+
44
+ # Build RLDS Iterable Dataset
45
+ cls = RLDSDataset if not episodic else EpisodicRLDSDataset
46
+ dataset = cls(
47
+ data_root_dir,
48
+ data_mix,
49
+ batch_transform,
50
+ resize_resolution=default_image_resolution[1:],
51
+ shuffle_buffer_size=shuffle_buffer_size,
52
+ train=train,
53
+ image_aug=image_aug,
54
+ )
55
+
56
+ return dataset, action_tokenizer, collator
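A rough usage sketch for `get_vla_dataset_and_collator`. The tokenizer, image transform, and prompt-builder class normally come from the instantiated VLA elsewhere in the codebase, so they are stubbed out here; every placeholder below is an assumption for illustration, not an API defined in this file:

from pathlib import Path

from torch.utils.data import DataLoader

from prismatic.vla.materialize import get_vla_dataset_and_collator

# Placeholders: supply the real components from the loaded VLA / its backbones.
tokenizer = ...          # transformers.PreTrainedTokenizerBase from the LLM backbone
image_transform = ...    # ImageTransform from the vision backbone
prompt_builder_fn = ...  # PromptBuilder subclass matching the LLM backbone

vla_dataset, action_tokenizer, collator = get_vla_dataset_and_collator(
    data_root_dir=Path("datasets/open-x-embodiment"),  # placeholder path
    data_mix="bridge",                                  # placeholder mixture name
    image_transform=image_transform,
    tokenizer=tokenizer,
    prompt_builder_fn=prompt_builder_fn,
    default_image_resolution=(3, 224, 224),
    shuffle_buffer_size=10_000,
    image_aug=True,
)

# RLDSDataset is an IterableDataset, so no shuffle/sampler arguments are passed here.
dataloader = DataLoader(vla_dataset, batch_size=8, collate_fn=collator, num_workers=0)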
run_scripts/ac/ac.sh ADDED
@@ -0,0 +1,87 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=fastvla_multi_scale_query
3
+ #========== !NOTE! ==========#
4
+ RUN_MODE=simvla_ffn_AC
5
+ use_predict_future_prop=False
6
+ batch_size=16
7
+ use_action_ts_head=True
8
+ use_one_embed=True
9
+ use_multi_scaling=False
10
+ mlp_type=ffn
11
+ decoder_num_blocks=2
12
+ robot_platform=libero
13
+ MODE=${RUN_MODE}_use_pp_${use_predict_future_prop}_use_ts_${use_action_ts_head}_use_one_${use_one_embed}_use_ms_${use_multi_scaling}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
14
+ #========== !NOTE! ==========#
15
+ use_l1_regression=True
16
+ num_images_in_input=1
17
+ wandb_entity=chenghaha
18
+ wandb_project=fastvla
19
+ wandb_log_freq=1
20
+ use_proprio=False
21
+ use_diffusion=False
22
+ use_film=False
23
+ num_steps_before_decay=20000
24
+ save_freq=5000
25
+ max_steps=40000
26
+ vla_path=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/ai_models/openvla/openvla-7b
27
+ data_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/datasets/openvla/modified_libero_rlds
28
+ dataset_name=libero_4_task_suites_no_noops
29
+ run_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH/results/$RUN_MODE
30
+ #========== get run_id ==========#
31
+ note_parts=("${MODE}")
32
+
33
+ # if [ "$use_l1_regression" = "True" ]; then
34
+ # note_parts+=("L1_regression")
35
+ # fi
36
+
37
+ # if [ "$num_images_in_input" == 1 ]; then
38
+ # note_parts+=("3rd_person_img")
39
+ # else
40
+ # note_parts+=("3rd_person_img_and_wrist")
41
+ # fi
42
+
43
+ # if [ "$use_l1_regression" = "True" ]; then
44
+ # note_parts+=("proprio_state")
45
+ # fi
46
+
47
+ # if [ "$use_film" = "True" ]; then
48
+ # note_parts+=("Film")
49
+ # fi
50
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
51
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
52
+
53
+ #========== enter environment ==========#
54
+ conda activate openvla-oft
55
+ cd /inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
56
+ export PYTHONPATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
57
+
58
+ #========== run ==========#
59
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
60
+ --vla_path "$vla_path" \
61
+ --data_root_dir "$data_root_dir" \
62
+ --dataset_name "$dataset_name" \
63
+ --run_root_dir "$run_root_dir" \
64
+ --use_l1_regression "$use_l1_regression" \
65
+ --use_diffusion "$use_diffusion" \
66
+ --use_film "$use_film" \
67
+ --num_images_in_input "$num_images_in_input" \
68
+ --use_proprio "$use_proprio" \
69
+ --batch_size "$batch_size" \
70
+ --learning_rate 5e-4 \
71
+ --num_steps_before_decay "$num_steps_before_decay" \
72
+ --max_steps "$max_steps" \
73
+ --save_freq "$save_freq" \
74
+ --save_latest_checkpoint_only False \
75
+ --image_aug True \
76
+ --lora_rank 32 \
77
+ --wandb_entity "$wandb_entity" \
78
+ --wandb_project "$wandb_project" \
79
+ --wandb_log_freq "$wandb_log_freq" \
80
+ --run_id_note "$run_id_note_value" \
81
+ --use_predict_future_prop "$use_predict_future_prop" \
82
+ --use_action_ts_head "$use_action_ts_head" \
83
+ --use_one_embed "$use_one_embed" \
84
+ --use_multi_scaling "$use_multi_scaling" \
85
+ --mlp_type "$mlp_type" \
86
+ --decoder_num_blocks "$decoder_num_blocks" \
87
+ --robot_platform "$robot_platform"
run_scripts/ffn/3ffn2.sh ADDED
@@ -0,0 +1,87 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=fastvla_multi_scale_query
3
+ #========== !NOTE! ==========#
4
+ RUN_MODE=simvla3_ffn
5
+ use_predict_future_prop=False
6
+ batch_size=16
7
+ use_action_ts_head=True
8
+ use_one_embed=True
9
+ use_multi_scaling=False
10
+ mlp_type=ffn
11
+ decoder_num_blocks=2
12
+ robot_platform=libero
13
+ MODE=${RUN_MODE}_use_pp_${use_predict_future_prop}_use_ts_${use_action_ts_head}_use_one_${use_one_embed}_use_ms_${use_multi_scaling}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
14
+ #========== !NOTE! ==========#
15
+ use_l1_regression=True
16
+ num_images_in_input=1
17
+ wandb_entity=chenghaha
18
+ wandb_project=fastvla
19
+ wandb_log_freq=1
20
+ use_proprio=False
21
+ use_diffusion=False
22
+ use_film=False
23
+ num_steps_before_decay=20000
24
+ save_freq=10000
25
+ max_steps=40000
26
+ vla_path=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/ai_models/openvla/openvla-7b
27
+ data_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/datasets/openvla/modified_libero_rlds
28
+ dataset_name=libero_4_task_suites_no_noops
29
+ run_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH/results/$RUN_MODE
30
+ #========== get run_id ==========#
31
+ note_parts=("${MODE}")
32
+
33
+ # if [ "$use_l1_regression" = "True" ]; then
34
+ # note_parts+=("L1_regression")
35
+ # fi
36
+
37
+ # if [ "$num_images_in_input" == 1 ]; then
38
+ # note_parts+=("3rd_person_img")
39
+ # else
40
+ # note_parts+=("3rd_person_img_and_wrist")
41
+ # fi
42
+
43
+ # if [ "$use_l1_regression" = "True" ]; then
44
+ # note_parts+=("proprio_state")
45
+ # fi
46
+
47
+ # if [ "$use_film" = "True" ]; then
48
+ # note_parts+=("Film")
49
+ # fi
50
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
51
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
52
+
53
+ #========== enter environment ==========#
54
+ conda activate openvla-oft
55
+ cd /inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
56
+ export PYTHONPATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
57
+
58
+ #========== run ==========#
59
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
60
+ --vla_path "$vla_path" \
61
+ --data_root_dir "$data_root_dir" \
62
+ --dataset_name "$dataset_name" \
63
+ --run_root_dir "$run_root_dir" \
64
+ --use_l1_regression "$use_l1_regression" \
65
+ --use_diffusion "$use_diffusion" \
66
+ --use_film "$use_film" \
67
+ --num_images_in_input "$num_images_in_input" \
68
+ --use_proprio "$use_proprio" \
69
+ --batch_size "$batch_size" \
70
+ --learning_rate 5e-4 \
71
+ --num_steps_before_decay "$num_steps_before_decay" \
72
+ --max_steps "$max_steps" \
73
+ --save_freq "$save_freq" \
74
+ --save_latest_checkpoint_only False \
75
+ --image_aug True \
76
+ --lora_rank 32 \
77
+ --wandb_entity "$wandb_entity" \
78
+ --wandb_project "$wandb_project" \
79
+ --wandb_log_freq "$wandb_log_freq" \
80
+ --run_id_note "$run_id_note_value" \
81
+ --use_predict_future_prop "$use_predict_future_prop" \
82
+ --use_action_ts_head "$use_action_ts_head" \
83
+ --use_one_embed "$use_one_embed" \
84
+ --use_multi_scaling "$use_multi_scaling" \
85
+ --mlp_type "$mlp_type" \
86
+ --decoder_num_blocks "$decoder_num_blocks" \
87
+ --robot_platform "$robot_platform"
run_scripts/ffn/3postffn2.sh ADDED
@@ -0,0 +1,87 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=fastvla_multi_scale_query
3
+ #========== !NOTE! ==========#
4
+ RUN_MODE=simvla3_postffn
5
+ use_predict_future_prop=False
6
+ batch_size=16
7
+ use_action_ts_head=True
8
+ use_one_embed=True
9
+ use_multi_scaling=False
10
+ mlp_type=postffn
11
+ decoder_num_blocks=2
12
+ robot_platform=libero
13
+ MODE=${RUN_MODE}_use_pp_${use_predict_future_prop}_use_ts_${use_action_ts_head}_use_one_${use_one_embed}_use_ms_${use_multi_scaling}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
14
+ #========== !NOTE! ==========#
15
+ use_l1_regression=True
16
+ num_images_in_input=1
17
+ wandb_entity=chenghaha
18
+ wandb_project=fastvla
19
+ wandb_log_freq=1
20
+ use_proprio=False
21
+ use_diffusion=False
22
+ use_film=False
23
+ num_steps_before_decay=20000
24
+ save_freq=10000
25
+ max_steps=40000
26
+ vla_path=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/ai_models/openvla/openvla-7b
27
+ data_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/datasets/openvla/modified_libero_rlds
28
+ dataset_name=libero_4_task_suites_no_noops
29
+ run_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH/results/$RUN_MODE
30
+ #========== get run_id ==========#
31
+ note_parts=("${MODE}")
32
+
33
+ # if [ "$use_l1_regression" = "True" ]; then
34
+ # note_parts+=("L1_regression")
35
+ # fi
36
+
37
+ # if [ "$num_images_in_input" == 1 ]; then
38
+ # note_parts+=("3rd_person_img")
39
+ # else
40
+ # note_parts+=("3rd_person_img_and_wrist")
41
+ # fi
42
+
43
+ # if [ "$use_l1_regression" = "True" ]; then
44
+ # note_parts+=("proprio_state")
45
+ # fi
46
+
47
+ # if [ "$use_film" = "True" ]; then
48
+ # note_parts+=("Film")
49
+ # fi
50
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
51
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
52
+
53
+ #========== enter environment ==========#
54
+ conda activate openvla-oft
55
+ cd /inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
56
+ export PYTHONPATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
57
+
58
+ #========== run ==========#
59
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
60
+ --vla_path "$vla_path" \
61
+ --data_root_dir "$data_root_dir" \
62
+ --dataset_name "$dataset_name" \
63
+ --run_root_dir "$run_root_dir" \
64
+ --use_l1_regression "$use_l1_regression" \
65
+ --use_diffusion "$use_diffusion" \
66
+ --use_film "$use_film" \
67
+ --num_images_in_input "$num_images_in_input" \
68
+ --use_proprio "$use_proprio" \
69
+ --batch_size "$batch_size" \
70
+ --learning_rate 5e-4 \
71
+ --num_steps_before_decay "$num_steps_before_decay" \
72
+ --max_steps "$max_steps" \
73
+ --save_freq "$save_freq" \
74
+ --save_latest_checkpoint_only False \
75
+ --image_aug True \
76
+ --lora_rank 32 \
77
+ --wandb_entity "$wandb_entity" \
78
+ --wandb_project "$wandb_project" \
79
+ --wandb_log_freq "$wandb_log_freq" \
80
+ --run_id_note "$run_id_note_value" \
81
+ --use_predict_future_prop "$use_predict_future_prop" \
82
+ --use_action_ts_head "$use_action_ts_head" \
83
+ --use_one_embed "$use_one_embed" \
84
+ --use_multi_scaling "$use_multi_scaling" \
85
+ --mlp_type "$mlp_type" \
86
+ --decoder_num_blocks "$decoder_num_blocks" \
87
+ --robot_platform "$robot_platform"
run_scripts/ffn/3postffn6.sh ADDED
@@ -0,0 +1,87 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=fastvla_multi_scale_query
3
+ #========== !NOTE! ==========#
4
+ RUN_MODE=simvla3_postffn
5
+ use_predict_future_prop=False
6
+ batch_size=16
7
+ use_action_ts_head=True
8
+ use_one_embed=True
9
+ use_multi_scaling=False
10
+ mlp_type=postffn
11
+ decoder_num_blocks=6
12
+ robot_platform=libero
13
+ MODE=${RUN_MODE}_use_pp_${use_predict_future_prop}_use_ts_${use_action_ts_head}_use_one_${use_one_embed}_use_ms_${use_multi_scaling}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
14
+ #========== !NOTE! ==========#
15
+ use_l1_regression=True
16
+ num_images_in_input=1
17
+ wandb_entity=chenghaha
18
+ wandb_project=fastvla
19
+ wandb_log_freq=1
20
+ use_proprio=False
21
+ use_diffusion=False
22
+ use_film=False
23
+ num_steps_before_decay=20000
24
+ save_freq=10000
25
+ max_steps=40000
26
+ vla_path=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/ai_models/openvla/openvla-7b
27
+ data_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/datasets/openvla/modified_libero_rlds
28
+ dataset_name=libero_4_task_suites_no_noops
29
+ run_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH/results/$RUN_MODE
30
+ #========== get run_id ==========#
31
+ note_parts=("${MODE}")
32
+
33
+ # if [ "$use_l1_regression" = "True" ]; then
34
+ # note_parts+=("L1_regression")
35
+ # fi
36
+
37
+ # if [ "$num_images_in_input" == 1 ]; then
38
+ # note_parts+=("3rd_person_img")
39
+ # else
40
+ # note_parts+=("3rd_person_img_and_wrist")
41
+ # fi
42
+
43
+ # if [ "$use_l1_regression" = "True" ]; then
44
+ # note_parts+=("proprio_state")
45
+ # fi
46
+
47
+ # if [ "$use_film" = "True" ]; then
48
+ # note_parts+=("Film")
49
+ # fi
50
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
51
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
52
+
53
+ #========== enter environment ==========#
54
+ conda activate openvla-oft
55
+ cd /inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
56
+ export PYTHONPATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
57
+
58
+ #========== run ==========#
59
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
60
+ --vla_path "$vla_path" \
61
+ --data_root_dir "$data_root_dir" \
62
+ --dataset_name "$dataset_name" \
63
+ --run_root_dir "$run_root_dir" \
64
+ --use_l1_regression "$use_l1_regression" \
65
+ --use_diffusion "$use_diffusion" \
66
+ --use_film "$use_film" \
67
+ --num_images_in_input "$num_images_in_input" \
68
+ --use_proprio "$use_proprio" \
69
+ --batch_size "$batch_size" \
70
+ --learning_rate 5e-4 \
71
+ --num_steps_before_decay "$num_steps_before_decay" \
72
+ --max_steps "$max_steps" \
73
+ --save_freq "$save_freq" \
74
+ --save_latest_checkpoint_only False \
75
+ --image_aug True \
76
+ --lora_rank 32 \
77
+ --wandb_entity "$wandb_entity" \
78
+ --wandb_project "$wandb_project" \
79
+ --wandb_log_freq "$wandb_log_freq" \
80
+ --run_id_note "$run_id_note_value" \
81
+ --use_predict_future_prop "$use_predict_future_prop" \
82
+ --use_action_ts_head "$use_action_ts_head" \
83
+ --use_one_embed "$use_one_embed" \
84
+ --use_multi_scaling "$use_multi_scaling" \
85
+ --mlp_type "$mlp_type" \
86
+ --decoder_num_blocks "$decoder_num_blocks" \
87
+ --robot_platform "$robot_platform"
run_scripts/ffn/debug_5ffn_withactionprojector.sh ADDED
@@ -0,0 +1,87 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=fastvla_multi_scale_query
3
+ #========== !NOTE! ==========#
4
+ RUN_MODE=simvla4_ffn_withprojector
5
+ use_predict_future_prop=False
6
+ batch_size=16
7
+ use_action_ts_head=True
8
+ use_one_embed=True
9
+ use_multi_scaling=False
10
+ mlp_type=ffn
11
+ decoder_num_blocks=2
12
+ robot_platform=libero
13
+ MODE=${RUN_MODE}_use_pp_${use_predict_future_prop}_use_ts_${use_action_ts_head}_use_one_${use_one_embed}_use_ms_${use_multi_scaling}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
14
+ #========== !NOTE! ==========#
15
+ use_l1_regression=True
16
+ num_images_in_input=1
17
+ wandb_entity=chenghaha
18
+ wandb_project=fastvla
19
+ wandb_log_freq=1
20
+ use_proprio=False
21
+ use_diffusion=False
22
+ use_film=False
23
+ num_steps_before_decay=20000
24
+ save_freq=10000
25
+ max_steps=40000
26
+ vla_path=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/ai_models/openvla/openvla-7b
27
+ data_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/datasets/openvla/modified_libero_rlds
28
+ dataset_name=libero_4_task_suites_no_noops
29
+ run_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH/results/$RUN_MODE
30
+ #========== get run_id ==========#
31
+ note_parts=("${MODE}")
32
+
33
+ # if [ "$use_l1_regression" = "True" ]; then
34
+ # note_parts+=("L1_regression")
35
+ # fi
36
+
37
+ # if [ "$num_images_in_input" == 1 ]; then
38
+ # note_parts+=("3rd_person_img")
39
+ # else
40
+ # note_parts+=("3rd_person_img_and_wrist")
41
+ # fi
42
+
43
+ # if [ "$use_l1_regression" = "True" ]; then
44
+ # note_parts+=("proprio_state")
45
+ # fi
46
+
47
+ # if [ "$use_film" = "True" ]; then
48
+ # note_parts+=("Film")
49
+ # fi
50
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
51
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
52
+
53
+ #========== enter environment ==========#
54
+ conda activate openvla-oft
55
+ cd /inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
56
+ export PYTHONPATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
57
+
58
+ #========== run ==========#
59
+ WANDB_CONSOLE=off WANDB_MODE=offline python -m debugpy --listen 1234 --wait-for-client '/opt/conda/envs/spatialvla/bin/torchrun' --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \
60
+ --vla_path "$vla_path" \
61
+ --data_root_dir "$data_root_dir" \
62
+ --dataset_name "$dataset_name" \
63
+ --run_root_dir "$run_root_dir" \
64
+ --use_l1_regression "$use_l1_regression" \
65
+ --use_diffusion "$use_diffusion" \
66
+ --use_film "$use_film" \
67
+ --num_images_in_input "$num_images_in_input" \
68
+ --use_proprio "$use_proprio" \
69
+ --batch_size "$batch_size" \
70
+ --learning_rate 5e-5 \
71
+ --num_steps_before_decay "$num_steps_before_decay" \
72
+ --max_steps "$max_steps" \
73
+ --save_freq "$save_freq" \
74
+ --save_latest_checkpoint_only False \
75
+ --image_aug True \
76
+ --lora_rank 32 \
77
+ --wandb_entity "$wandb_entity" \
78
+ --wandb_project "$wandb_project" \
79
+ --wandb_log_freq "$wandb_log_freq" \
80
+ --run_id_note "$run_id_note_value" \
81
+ --use_predict_future_prop "$use_predict_future_prop" \
82
+ --use_action_ts_head "$use_action_ts_head" \
83
+ --use_one_embed "$use_one_embed" \
84
+ --use_multi_scaling "$use_multi_scaling" \
85
+ --mlp_type "$mlp_type" \
86
+ --decoder_num_blocks "$decoder_num_blocks" \
87
+ --robot_platform "$robot_platform"
run_scripts/ffn/ffn4.sh ADDED
@@ -0,0 +1,87 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=fastvla_multi_scale_query
3
+ #========== !NOTE! ==========#
4
+ RUN_MODE=simvla_ffn
5
+ use_predict_future_prop=False
6
+ batch_size=16
7
+ use_action_ts_head=True
8
+ use_one_embed=True
9
+ use_multi_scaling=False
10
+ mlp_type=ffn
11
+ decoder_num_blocks=4
12
+ robot_platform=libero
13
+ MODE=${RUN_MODE}_use_pp_${use_predict_future_prop}_use_ts_${use_action_ts_head}_use_one_${use_one_embed}_use_ms_${use_multi_scaling}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
14
+ #========== !NOTE! ==========#
15
+ use_l1_regression=True
16
+ num_images_in_input=1
17
+ wandb_entity=chenghaha
18
+ wandb_project=fastvla
19
+ wandb_log_freq=1
20
+ use_proprio=False
21
+ use_diffusion=False
22
+ use_film=False
23
+ num_steps_before_decay=20000
24
+ save_freq=5000
25
+ max_steps=40000
26
+ vla_path=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/ai_models/openvla/openvla-7b
27
+ data_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/datasets/openvla/modified_libero_rlds
28
+ dataset_name=libero_4_task_suites_no_noops
29
+ run_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH/results/$RUN_MODE
30
+ #========== get run_id ==========#
31
+ note_parts=("${MODE}")
32
+
33
+ # if [ "$use_l1_regression" = "True" ]; then
34
+ # note_parts+=("L1_regression")
35
+ # fi
36
+
37
+ # if [ "$num_images_in_input" == 1 ]; then
38
+ # note_parts+=("3rd_person_img")
39
+ # else
40
+ # note_parts+=("3rd_person_img_and_wrist")
41
+ # fi
42
+
43
+ # if [ "$use_l1_regression" = "True" ]; then
44
+ # note_parts+=("proprio_state")
45
+ # fi
46
+
47
+ # if [ "$use_film" = "True" ]; then
48
+ # note_parts+=("Film")
49
+ # fi
50
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
51
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
52
+
53
+ #========== enter environment ==========#
54
+ conda activate openvla-oft
55
+ cd /inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
56
+ export PYTHONPATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
57
+
58
+ #========== run ==========#
59
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
60
+ --vla_path "$vla_path" \
61
+ --data_root_dir "$data_root_dir" \
62
+ --dataset_name "$dataset_name" \
63
+ --run_root_dir "$run_root_dir" \
64
+ --use_l1_regression "$use_l1_regression" \
65
+ --use_diffusion "$use_diffusion" \
66
+ --use_film "$use_film" \
67
+ --num_images_in_input "$num_images_in_input" \
68
+ --use_proprio "$use_proprio" \
69
+ --batch_size "$batch_size" \
70
+ --learning_rate 5e-4 \
71
+ --num_steps_before_decay "$num_steps_before_decay" \
72
+ --max_steps "$max_steps" \
73
+ --save_freq "$save_freq" \
74
+ --save_latest_checkpoint_only False \
75
+ --image_aug True \
76
+ --lora_rank 32 \
77
+ --wandb_entity "$wandb_entity" \
78
+ --wandb_project "$wandb_project" \
79
+ --wandb_log_freq "$wandb_log_freq" \
80
+ --run_id_note "$run_id_note_value" \
81
+ --use_predict_future_prop "$use_predict_future_prop" \
82
+ --use_action_ts_head "$use_action_ts_head" \
83
+ --use_one_embed "$use_one_embed" \
84
+ --use_multi_scaling "$use_multi_scaling" \
85
+ --mlp_type "$mlp_type" \
86
+ --decoder_num_blocks "$decoder_num_blocks" \
87
+ --robot_platform "$robot_platform"
run_scripts/ffn/ffn8.sh ADDED
@@ -0,0 +1,87 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=fastvla_multi_scale_query
3
+ #========== !NOTE! ==========#
4
+ RUN_MODE=simvla_ffn
5
+ use_predict_future_prop=False
6
+ batch_size=16
7
+ use_action_ts_head=True
8
+ use_one_embed=True
9
+ use_multi_scaling=False
10
+ mlp_type=ffn
11
+ decoder_num_blocks=8
12
+ robot_platform=libero
13
+ MODE=${RUN_MODE}_use_pp_${use_predict_future_prop}_use_ts_${use_action_ts_head}_use_one_${use_one_embed}_use_ms_${use_multi_scaling}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
14
+ #========== !NOTE! ==========#
15
+ use_l1_regression=True
16
+ num_images_in_input=1
17
+ wandb_entity=chenghaha
18
+ wandb_project=fastvla
19
+ wandb_log_freq=1
20
+ use_proprio=False
21
+ use_diffusion=False
22
+ use_film=False
23
+ num_steps_before_decay=20000
24
+ save_freq=5000
25
+ max_steps=40000
26
+ vla_path=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/ai_models/openvla/openvla-7b
27
+ data_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/datasets/openvla/modified_libero_rlds
28
+ dataset_name=libero_4_task_suites_no_noops
29
+ run_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH/results/$RUN_MODE
30
+ #========== get run_id ==========#
31
+ note_parts=("${MODE}")
32
+
33
+ # if [ "$use_l1_regression" = "True" ]; then
34
+ # note_parts+=("L1_regression")
35
+ # fi
36
+
37
+ # if [ "$num_images_in_input" == 1 ]; then
38
+ # note_parts+=("3rd_person_img")
39
+ # else
40
+ # note_parts+=("3rd_person_img_and_wrist")
41
+ # fi
42
+
43
+ # if [ "$use_l1_regression" = "True" ]; then
44
+ # note_parts+=("proprio_state")
45
+ # fi
46
+
47
+ # if [ "$use_film" = "True" ]; then
48
+ # note_parts+=("Film")
49
+ # fi
50
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
51
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
52
+
53
+ #========== enter environment ==========#
54
+ conda activate openvla-oft
55
+ cd /inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
56
+ export PYTHONPATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
57
+
58
+ #========== run ==========#
59
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
60
+ --vla_path "$vla_path" \
61
+ --data_root_dir "$data_root_dir" \
62
+ --dataset_name "$dataset_name" \
63
+ --run_root_dir "$run_root_dir" \
64
+ --use_l1_regression "$use_l1_regression" \
65
+ --use_diffusion "$use_diffusion" \
66
+ --use_film "$use_film" \
67
+ --num_images_in_input "$num_images_in_input" \
68
+ --use_proprio "$use_proprio" \
69
+ --batch_size "$batch_size" \
70
+ --learning_rate 5e-4 \
71
+ --num_steps_before_decay "$num_steps_before_decay" \
72
+ --max_steps "$max_steps" \
73
+ --save_freq "$save_freq" \
74
+ --save_latest_checkpoint_only False \
75
+ --image_aug True \
76
+ --lora_rank 32 \
77
+ --wandb_entity "$wandb_entity" \
78
+ --wandb_project "$wandb_project" \
79
+ --wandb_log_freq "$wandb_log_freq" \
80
+ --run_id_note "$run_id_note_value" \
81
+ --use_predict_future_prop "$use_predict_future_prop" \
82
+ --use_action_ts_head "$use_action_ts_head" \
83
+ --use_one_embed "$use_one_embed" \
84
+ --use_multi_scaling "$use_multi_scaling" \
85
+ --mlp_type "$mlp_type" \
86
+ --decoder_num_blocks "$decoder_num_blocks" \
87
+ --robot_platform "$robot_platform"
run_scripts/ffn/test.sh ADDED
@@ -0,0 +1,87 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=fastvla_multi_scale_query
3
+ #========== !NOTE! ==========#
4
+ RUN_MODE=test
5
+ use_predict_future_prop=False
6
+ batch_size=16
7
+ use_action_ts_head=True
8
+ use_one_embed=True
9
+ use_multi_scaling=False
10
+ mlp_type=ffn
11
+ decoder_num_blocks=2
12
+ robot_platform=libero
13
+ MODE=${RUN_MODE}_use_pp_${use_predict_future_prop}_use_ts_${use_action_ts_head}_use_one_${use_one_embed}_use_ms_${use_multi_scaling}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
14
+ #========== !NOTE! ==========#
15
+ use_l1_regression=True
16
+ num_images_in_input=1
17
+ wandb_entity=chenghaha
18
+ wandb_project=fastvla
19
+ wandb_log_freq=1
20
+ use_proprio=False
21
+ use_diffusion=False
22
+ use_film=False
23
+ num_steps_before_decay=20000
24
+ save_freq=5000
25
+ max_steps=40000
26
+ vla_path=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/ai_models/openvla/openvla-7b
27
+ data_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/datasets/openvla/modified_libero_rlds
28
+ dataset_name=libero_4_task_suites_no_noops
29
+ run_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH/results/$RUN_MODE
30
+ #========== get run_id ==========#
31
+ note_parts=("${MODE}")
32
+
33
+ # if [ "$use_l1_regression" = "True" ]; then
34
+ # note_parts+=("L1_regression")
35
+ # fi
36
+
37
+ # if [ "$num_images_in_input" == 1 ]; then
38
+ # note_parts+=("3rd_person_img")
39
+ # else
40
+ # note_parts+=("3rd_person_img_and_wrist")
41
+ # fi
42
+
43
+ # if [ "$use_l1_regression" = "True" ]; then
44
+ # note_parts+=("proprio_state")
45
+ # fi
46
+
47
+ # if [ "$use_film" = "True" ]; then
48
+ # note_parts+=("Film")
49
+ # fi
50
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
51
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
52
+
53
+ #========== enter environment ==========#
54
+ conda activate openvla-oft
55
+ cd /inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
56
+ export PYTHONPATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
57
+
58
+ #========== run ==========#
59
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \
60
+ --vla_path "$vla_path" \
61
+ --data_root_dir "$data_root_dir" \
62
+ --dataset_name "$dataset_name" \
63
+ --run_root_dir "$run_root_dir" \
64
+ --use_l1_regression "$use_l1_regression" \
65
+ --use_diffusion "$use_diffusion" \
66
+ --use_film "$use_film" \
67
+ --num_images_in_input "$num_images_in_input" \
68
+ --use_proprio "$use_proprio" \
69
+ --batch_size "$batch_size" \
70
+ --learning_rate 5e-4 \
71
+ --num_steps_before_decay "$num_steps_before_decay" \
72
+ --max_steps "$max_steps" \
73
+ --save_freq "$save_freq" \
74
+ --save_latest_checkpoint_only False \
75
+ --image_aug True \
76
+ --lora_rank 32 \
77
+ --wandb_entity "$wandb_entity" \
78
+ --wandb_project "$wandb_project" \
79
+ --wandb_log_freq "$wandb_log_freq" \
80
+ --run_id_note "$run_id_note_value" \
81
+ --use_predict_future_prop "$use_predict_future_prop" \
82
+ --use_action_ts_head "$use_action_ts_head" \
83
+ --use_one_embed "$use_one_embed" \
84
+ --use_multi_scaling "$use_multi_scaling" \
85
+ --mlp_type "$mlp_type" \
86
+ --decoder_num_blocks "$decoder_num_blocks" \
87
+ --robot_platform "$robot_platform"
run_scripts/ffn_long_chunks/run.sh ADDED
@@ -0,0 +1,4 @@
1
+ bash run_scripts/ffn_long_chunks/li4.sh
2
+ bash run_scripts/ffn_long_chunks/li16.sh
3
+ bash run_scripts/ffn_long_chunks/li24.sh
4
+ bash run_scripts/ffn_long_chunks/li32.sh
run_scripts/ffn_q2a/aloha/test_aloha_robotwin2_ffn_25_base.sh ADDED
@@ -0,0 +1,88 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=simvla_twin2
3
+ ROOT_PATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137
4
+ #========== !NOTE! ==========#
5
+ RUN_MODE=base
6
+ use_predict_future_prop=False
7
+ batch_size=4
8
+ use_action_ts_head=False
9
+ use_one_embed=False
10
+ use_multi_scaling=False
11
+ mlp_type=ffn
12
+ decoder_num_blocks=2
13
+ robot_platform=aloha
14
+ MODE=${RUN_MODE}_robot_platform_${robot_platform}
15
+ #========== !NOTE! ==========#
16
+ use_l1_regression=True
17
+ num_images_in_input=3
18
+ wandb_entity=chenghaha
19
+ wandb_project=robotwin
20
+ wandb_log_freq=1
21
+ use_proprio=True
22
+ use_diffusion=False
23
+ use_film=True
24
+ num_steps_before_decay=1000
25
+ save_freq=2000
26
+ max_steps=2000
27
+ vla_path=$ROOT_PATH/ai_models/openvla/openvla-7b
28
+ data_root_dir=$ROOT_PATH/datasets/TianxingChen/RoboTwin2.0/tfds
29
+ dataset_name=grab_roller_aloha_agilex_50
30
+ run_root_dir=$ROOT_PATH/vla_projects/$PROJECT_PATH/results/$RUN_MODE
31
+ #========== get run_id ==========#
32
+ note_parts=("${MODE}")
33
+
34
+ if [ "$use_l1_regression" = "True" ]; then
35
+ note_parts+=("L1_regression")
36
+ fi
37
+
38
+ if [ "$num_images_in_input" == 1 ]; then
39
+ note_parts+=("3rd_person_img")
40
+ else
41
+ note_parts+=("3rd_person_img_and_wrist")
42
+ fi
43
+
44
+ if [ "$use_proprio" = "True" ]; then
45
+ note_parts+=("proprio_state")
46
+ fi
47
+
48
+ if [ "$use_film" = "True" ]; then
49
+ note_parts+=("Film")
50
+ fi
51
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
52
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
53
+
54
+ #========== enter environment ==========#
55
+ source activate openvla-oft
56
+ cd $ROOT_PATH/vla_projects/$PROJECT_PATH
57
+ export PYTHONPATH=$ROOT_PATH/vla_projects/$PROJECT_PATH
58
+
59
+ #========== run ==========#
60
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
61
+ --vla_path "$vla_path" \
62
+ --data_root_dir "$data_root_dir" \
63
+ --dataset_name "$dataset_name" \
64
+ --run_root_dir "$run_root_dir" \
65
+ --use_l1_regression "$use_l1_regression" \
66
+ --use_diffusion "$use_diffusion" \
67
+ --use_film "$use_film" \
68
+ --num_images_in_input "$num_images_in_input" \
69
+ --use_proprio "$use_proprio" \
70
+ --batch_size "$batch_size" \
71
+ --learning_rate 5e-5 \
72
+ --num_steps_before_decay "$num_steps_before_decay" \
73
+ --max_steps "$max_steps" \
74
+ --save_freq "$save_freq" \
75
+ --save_latest_checkpoint_only False \
76
+ --image_aug True \
77
+ --lora_rank 32 \
78
+ --wandb_entity "$wandb_entity" \
79
+ --wandb_project "$wandb_project" \
80
+ --wandb_log_freq "$wandb_log_freq" \
81
+ --run_id_note "$run_id_note_value" \
82
+ --use_predict_future_prop "$use_predict_future_prop" \
83
+ --use_action_ts_head "$use_action_ts_head" \
84
+ --use_one_embed "$use_one_embed" \
85
+ --use_multi_scaling "$use_multi_scaling" \
86
+ --mlp_type "$mlp_type" \
87
+ --decoder_num_blocks "$decoder_num_blocks" \
88
+ --robot_platform "$robot_platform"
run_scripts/ffn_q2a/aloha/test_aloha_robotwin2_ffn_50_l2.sh ADDED
@@ -0,0 +1,102 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=simvla_twin2
3
+ ROOT_PATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137
4
+ #========== !NOTE! ==========#
5
+ RUN_MODE=simvla_50
6
+ use_predict_future_prop=False
7
+ batch_size=4
8
+ use_action_ts_head=True
9
+ use_one_embed=True
10
+ use_multi_scaling=False
11
+ mlp_type=ffn
12
+ decoder_num_blocks=4
13
+ robot_platform=50_al
14
+ proj_type=gelu_linear
15
+ ffn_type=gelu
16
+ expand_inner_ratio=1
17
+ linear_drop_ratio=0.1
18
+ multi_queries_num=50
19
+ multi_query_norm_type=layernorm
20
+ action_norm=l2
21
+ MODE=${RUN_MODE}_inner${expand_inner_ratio}_proj_type_${proj_type}_ffn_type_${ffn_type}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
22
+ #========== !NOTE! ==========#
23
+ use_l1_regression=True
24
+ num_images_in_input=3
25
+ wandb_entity=chenghaha
26
+ wandb_project=robotwin
27
+ wandb_log_freq=1
28
+ use_proprio=True
29
+ use_diffusion=False
30
+ use_film=True
31
+ num_steps_before_decay=2000
32
+ save_freq=3000
33
+ max_steps=3000
34
+ vla_path=$ROOT_PATH/ai_models/openvla/openvla-7b
35
+ data_root_dir=$ROOT_PATH/datasets/TianxingChen/RoboTwin2.0/tfds
36
+ dataset_name=grab_roller_aloha_agilex_50
37
+ run_root_dir=$ROOT_PATH/vla_projects/$PROJECT_PATH/results/$RUN_MODE
38
+ #========== get run_id ==========#
39
+ note_parts=("${MODE}")
40
+
41
+ # if [ "$use_l1_regression" = "True" ]; then
42
+ # note_parts+=("L1_regression")
43
+ # fi
44
+
45
+ # if [ "$num_images_in_input" == 1 ]; then
46
+ # note_parts+=("3rd_person_img")
47
+ # else
48
+ # note_parts+=("3rd_person_img_and_wrist")
49
+ # fi
50
+
51
+ # if [ "$use_l1_regression" = "True" ]; then
52
+ # note_parts+=("proprio_state")
53
+ # fi
54
+
55
+ # if [ "$use_film" = "True" ]; then
56
+ # note_parts+=("Film")
57
+ # fi
58
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
59
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
60
+
61
+ #========== enter environment ==========#
62
+ conda activate openvla-oft
63
+ cd $ROOT_PATH/vla_projects/$PROJECT_PATH
64
+ export PYTHONPATH=$ROOT_PATH/vla_projects/$PROJECT_PATH
65
+
66
+ #========== run ==========#
67
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
68
+ --vla_path "$vla_path" \
69
+ --data_root_dir "$data_root_dir" \
70
+ --dataset_name "$dataset_name" \
71
+ --run_root_dir "$run_root_dir" \
72
+ --use_l1_regression "$use_l1_regression" \
73
+ --use_diffusion "$use_diffusion" \
74
+ --use_film "$use_film" \
75
+ --num_images_in_input "$num_images_in_input" \
76
+ --use_proprio "$use_proprio" \
77
+ --batch_size "$batch_size" \
78
+ --learning_rate 5e-5 \
79
+ --num_steps_before_decay "$num_steps_before_decay" \
80
+ --max_steps "$max_steps" \
81
+ --save_freq "$save_freq" \
82
+ --save_latest_checkpoint_only False \
83
+ --image_aug True \
84
+ --lora_rank 32 \
85
+ --wandb_entity "$wandb_entity" \
86
+ --wandb_project "$wandb_project" \
87
+ --wandb_log_freq "$wandb_log_freq" \
88
+ --run_id_note "$run_id_note_value" \
89
+ --use_predict_future_prop "$use_predict_future_prop" \
90
+ --use_action_ts_head "$use_action_ts_head" \
91
+ --use_one_embed "$use_one_embed" \
92
+ --use_multi_scaling "$use_multi_scaling" \
93
+ --mlp_type "$mlp_type" \
94
+ --decoder_num_blocks "$decoder_num_blocks" \
95
+ --robot_platform "$robot_platform" \
96
+ --proj_type "$proj_type" \
97
+ --ffn_type "$ffn_type" \
98
+ --expand_inner_ratio "$expand_inner_ratio" \
99
+ --linear_drop_ratio "$linear_drop_ratio" \
100
+ --multi_query_norm_type "$multi_query_norm_type" \
101
+ --multi_queries_num "$multi_queries_num" \
102
+ --action_norm "$action_norm"
run_scripts/ffn_q2a/bridge/exffn_relu_connector_linear_relu.sh ADDED
@@ -0,0 +1,95 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=fastvla_multi_scale_q2a
3
+ ROOT_PATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137
4
+ #========== !NOTE! ==========#
5
+ RUN_MODE=simvla_q2a
6
+ use_predict_future_prop=False
7
+ batch_size=16
8
+ use_action_ts_head=True
9
+ use_one_embed=True
10
+ use_multi_scaling=False
11
+ mlp_type=ffn
12
+ decoder_num_blocks=6
13
+ robot_platform=bridge
14
+ without_head_drop_out=True
15
+ proj_type=linear_relu
16
+ ffn_type=relu
17
+ expand_actiondim_ratio=2.0
18
+ MODE=${RUN_MODE}_exffn${expand_actiondim_ratio}_proj_type_${proj_type}_ffn_type_${ffn_type}_use_ts_${use_action_ts_head}_use_one_${use_one_embed}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
19
+ #========== !NOTE! ==========#
20
+ use_l1_regression=True
21
+ num_images_in_input=1
22
+ wandb_entity=chenghaha
23
+ wandb_project=fastvla
24
+ wandb_log_freq=1
25
+ use_proprio=False
26
+ use_diffusion=False
27
+ use_film=False
28
+ num_steps_before_decay=20000
29
+ save_freq=10000
30
+ max_steps=50000
31
+ vla_path=$ROOT_PATH/ai_models/openvla/openvla-7b
32
+ data_root_dir=$ROOT_PATH/datasets/openx/data/origin
33
+ dataset_name=bridge
34
+ run_root_dir=$ROOT_PATH/vla_projects/$PROJECT_PATH/results/$RUN_MODE
35
+ #========== get run_id ==========#
36
+ note_parts=("${MODE}")
37
+
38
+ # if [ "$use_l1_regression" = "True" ]; then
39
+ # note_parts+=("L1_regression")
40
+ # fi
41
+
42
+ # if [ "$num_images_in_input" == 1 ]; then
43
+ # note_parts+=("3rd_person_img")
44
+ # else
45
+ # note_parts+=("3rd_person_img_and_wrist")
46
+ # fi
47
+
48
+ # if [ "$use_l1_regression" = "True" ]; then
49
+ # note_parts+=("proprio_state")
50
+ # fi
51
+
52
+ # if [ "$use_film" = "True" ]; then
53
+ # note_parts+=("Film")
54
+ # fi
55
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
56
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
57
+
58
+ #========== enter environment ==========#
59
+ conda activate openvla-oft
60
+ cd $ROOT_PATH/vla_projects/$PROJECT_PATH
61
+ export PYTHONPATH=$ROOT_PATH/vla_projects/$PROJECT_PATH
62
+
63
+ #========== run ==========#
64
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
65
+ --vla_path "$vla_path" \
66
+ --data_root_dir "$data_root_dir" \
67
+ --dataset_name "$dataset_name" \
68
+ --run_root_dir "$run_root_dir" \
69
+ --use_l1_regression "$use_l1_regression" \
70
+ --use_diffusion "$use_diffusion" \
71
+ --use_film "$use_film" \
72
+ --num_images_in_input "$num_images_in_input" \
73
+ --use_proprio "$use_proprio" \
74
+ --batch_size "$batch_size" \
75
+ --learning_rate 5e-4 \
76
+ --num_steps_before_decay "$num_steps_before_decay" \
77
+ --max_steps "$max_steps" \
78
+ --save_freq "$save_freq" \
79
+ --save_latest_checkpoint_only False \
80
+ --image_aug True \
81
+ --lora_rank 32 \
82
+ --wandb_entity "$wandb_entity" \
83
+ --wandb_project "$wandb_project" \
84
+ --wandb_log_freq "$wandb_log_freq" \
85
+ --run_id_note "$run_id_note_value" \
86
+ --use_predict_future_prop "$use_predict_future_prop" \
87
+ --use_action_ts_head "$use_action_ts_head" \
88
+ --use_one_embed "$use_one_embed" \
89
+ --use_multi_scaling "$use_multi_scaling" \
90
+ --mlp_type "$mlp_type" \
91
+ --decoder_num_blocks "$decoder_num_blocks" \
92
+ --robot_platform "$robot_platform" \
93
+ --proj_type "$proj_type" \
94
+ --ffn_type "$ffn_type" \
95
+ --expand_actiondim_ratio "$expand_actiondim_ratio"
run_scripts/ffn_q2a/bridge/run_bridge.sh ADDED
@@ -0,0 +1,2 @@
1
+ bash run_scripts/ffn_q2a/bridge/exffn_gelu_bridge_drop0_5.sh
2
+ bash run_scripts/ffn_q2a/bridge/exffn_gelu_bridge.sh
run_scripts/ffn_q2a/franka/exffn_gelu_franka.sh ADDED
@@ -0,0 +1,95 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=SimVLA_Condition
3
+ ROOT_PATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137
4
+ #========== !NOTE! ==========#
5
+ RUN_MODE=simvla_q2a
6
+ use_predict_future_prop=False
7
+ batch_size=16
8
+ use_action_ts_head=True
9
+ use_one_embed=True
10
+ use_multi_scaling=False
11
+ mlp_type=ffn
12
+ decoder_num_blocks=4
13
+ robot_platform=rt1
14
+ without_head_drop_out=True
15
+ proj_type=gelu_linear
16
+ ffn_type=gelu
17
+ expand_actiondim_ratio=1.0
18
+ MODE=${RUN_MODE}_exffn${expand_actiondim_ratio}_proj_type_${proj_type}_ffn_type_${ffn_type}_use_ts_${use_action_ts_head}_use_one_${use_one_embed}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
19
+ #========== !NOTE! ==========#
20
+ use_l1_regression=True
21
+ num_images_in_input=1
22
+ wandb_entity=chenghaha
23
+ wandb_project=fastvla
24
+ wandb_log_freq=1
25
+ use_proprio=False
26
+ use_diffusion=False
27
+ use_film=False
28
+ num_steps_before_decay=30000
29
+ save_freq=10000
30
+ max_steps=60000
31
+ vla_path=$ROOT_PATH/ai_models/openvla/openvla-7b
32
+ data_root_dir=$ROOT_PATH/datasets/openx/data/origin
33
+ dataset_name=rt1
34
+ run_root_dir=$ROOT_PATH/vla_projects/$PROJECT_PATH/results/$RUN_MODE
35
+ #========== get run_id ==========#
36
+ note_parts=("${MODE}")
37
+
38
+ # if [ "$use_l1_regression" = "True" ]; then
39
+ # note_parts+=("L1_regression")
40
+ # fi
41
+
42
+ # if [ "$num_images_in_input" == 1 ]; then
43
+ # note_parts+=("3rd_person_img")
44
+ # else
45
+ # note_parts+=("3rd_person_img_and_wrist")
46
+ # fi
47
+
48
+ # if [ "$use_l1_regression" = "True" ]; then
49
+ # note_parts+=("proprio_state")
50
+ # fi
51
+
52
+ # if [ "$use_film" = "True" ]; then
53
+ # note_parts+=("Film")
54
+ # fi
55
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
56
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
57
+
58
+ #========== enter environment ==========#
59
+ conda activate openvla-oft
60
+ cd $ROOT_PATH/vla_projects/$PROJECT_PATH
61
+ export PYTHONPATH=$ROOT_PATH/vla_projects/$PROJECT_PATH
62
+
63
+ #========== run ==========#
64
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
65
+ --vla_path "$vla_path" \
66
+ --data_root_dir "$data_root_dir" \
67
+ --dataset_name "$dataset_name" \
68
+ --run_root_dir "$run_root_dir" \
69
+ --use_l1_regression "$use_l1_regression" \
70
+ --use_diffusion "$use_diffusion" \
71
+ --use_film "$use_film" \
72
+ --num_images_in_input "$num_images_in_input" \
73
+ --use_proprio "$use_proprio" \
74
+ --batch_size "$batch_size" \
75
+ --learning_rate 5e-4 \
76
+ --num_steps_before_decay "$num_steps_before_decay" \
77
+ --max_steps "$max_steps" \
78
+ --save_freq "$save_freq" \
79
+ --save_latest_checkpoint_only False \
80
+ --image_aug True \
81
+ --lora_rank 32 \
82
+ --wandb_entity "$wandb_entity" \
83
+ --wandb_project "$wandb_project" \
84
+ --wandb_log_freq "$wandb_log_freq" \
85
+ --run_id_note "$run_id_note_value" \
86
+ --use_predict_future_prop "$use_predict_future_prop" \
87
+ --use_action_ts_head "$use_action_ts_head" \
88
+ --use_one_embed "$use_one_embed" \
89
+ --use_multi_scaling "$use_multi_scaling" \
90
+ --mlp_type "$mlp_type" \
91
+ --decoder_num_blocks "$decoder_num_blocks" \
92
+ --robot_platform "$robot_platform" \
93
+ --proj_type "$proj_type" \
94
+ --ffn_type "$ffn_type" \
95
+ --expand_actiondim_ratio "$expand_actiondim_ratio"
run_scripts/ffn_q2a/libero_moe/debug_moe_lit.sh ADDED
@@ -0,0 +1,101 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=SimVLA
3
+ ROOT_PATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/jiajiuyang-240108580167/chengdongzhou
4
+ #========== !NOTE! ==========#
5
+ RUN_MODE=simvla_q2a_lit
6
+ use_predict_future_prop=False
7
+ batch_size=2
8
+ use_action_ts_head=True
9
+ use_one_embed=True
10
+ use_multi_scaling=False
11
+ mlp_type=moe
12
+ decoder_num_blocks=2
13
+ robot_platform=16_li
14
+ without_head_drop_out=True
15
+ proj_type=gelu_linear
16
+ ffn_type=gelu
17
+ num_experts=8
18
+ expand_inner_ratio=2
19
+ top_k=2
20
+ expand_actiondim_ratio=0.5
21
+ MODE=${RUN_MODE}_ex${expand_actiondim_ratio}_inner${expand_inner_ratio}_proj_type_${proj_type}_ffn_type_${ffn_type}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}_num_experts${num_experts}_top_k${top_k}
22
+ #========== !NOTE! ==========#
23
+ use_l1_regression=True
24
+ num_images_in_input=1
25
+ wandb_entity=chenghaha
26
+ wandb_project=fastvla
27
+ wandb_log_freq=1
28
+ use_proprio=False
29
+ use_diffusion=False
30
+ use_film=False
31
+ num_steps_before_decay=20000
32
+ save_freq=10000
33
+ max_steps=50000
34
+ vla_path=$ROOT_PATH/ai_models/openvla
35
+ data_root_dir=$ROOT_PATH/datasets/openvla/modified_libero_rlds
36
+ dataset_name=libero_4_task_suites_no_noops
37
+ run_root_dir=$ROOT_PATH/vla_projects/$PROJECT_PATH/results/$RUN_MODE
38
+ #========== get run_id ==========#
39
+ note_parts=("${MODE}")
40
+
41
+ # if [ "$use_l1_regression" = "True" ]; then
42
+ # note_parts+=("L1_regression")
43
+ # fi
44
+
45
+ # if [ "$num_images_in_input" == 1 ]; then
46
+ # note_parts+=("3rd_person_img")
47
+ # else
48
+ # note_parts+=("3rd_person_img_and_wrist")
49
+ # fi
50
+
51
+ # if [ "$use_l1_regression" = "True" ]; then
52
+ # note_parts+=("proprio_state")
53
+ # fi
54
+
55
+ # if [ "$use_film" = "True" ]; then
56
+ # note_parts+=("Film")
57
+ # fi
58
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
59
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
60
+
61
+ #========== enter environment ==========#
62
+ conda activate openvla-oft
63
+ cd $ROOT_PATH/vla_projects/$PROJECT_PATH
64
+ export PYTHONPATH=$ROOT_PATH/vla_projects/$PROJECT_PATH
65
+
66
+ #========== run ==========#
67
+ WANDB_CONSOLE=off WANDB_MODE=offline python -m debugpy --listen 1234 --wait-for-client '/opt/conda/envs/openvla-oft/bin/torchrun' --standalone --nnodes 1 --nproc-per-node 1 vla-scripts/finetune.py \
68
+ --vla_path "$vla_path" \
69
+ --data_root_dir "$data_root_dir" \
70
+ --dataset_name "$dataset_name" \
71
+ --run_root_dir "$run_root_dir" \
72
+ --use_l1_regression "$use_l1_regression" \
73
+ --use_diffusion "$use_diffusion" \
74
+ --use_film "$use_film" \
75
+ --num_images_in_input "$num_images_in_input" \
76
+ --use_proprio "$use_proprio" \
77
+ --batch_size "$batch_size" \
78
+ --learning_rate 5e-4 \
79
+ --num_steps_before_decay "$num_steps_before_decay" \
80
+ --max_steps "$max_steps" \
81
+ --save_freq "$save_freq" \
82
+ --save_latest_checkpoint_only False \
83
+ --image_aug True \
84
+ --lora_rank 32 \
85
+ --wandb_entity "$wandb_entity" \
86
+ --wandb_project "$wandb_project" \
87
+ --wandb_log_freq "$wandb_log_freq" \
88
+ --run_id_note "$run_id_note_value" \
89
+ --use_predict_future_prop "$use_predict_future_prop" \
90
+ --use_action_ts_head "$use_action_ts_head" \
91
+ --use_one_embed "$use_one_embed" \
92
+ --use_multi_scaling "$use_multi_scaling" \
93
+ --mlp_type "$mlp_type" \
94
+ --decoder_num_blocks "$decoder_num_blocks" \
95
+ --robot_platform "$robot_platform" \
96
+ --proj_type "$proj_type" \
97
+ --ffn_type "$ffn_type" \
98
+ --expand_inner_ratio "$expand_inner_ratio" \
99
+ --expand_actiondim_ratio "$expand_actiondim_ratio" \
100
+ --num_experts "$num_experts" \
101
+ --top_k "$top_k"
run_scripts/ffn_q2a/simhead/simhead_contrastive.sh ADDED
@@ -0,0 +1,100 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=SimVLA_Condition
3
+ ROOT_PATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137
4
+ #========== !NOTE! ==========#
5
+ RUN_MODE=simvla_q2a
6
+ use_predict_future_prop=False
7
+ batch_size=16
8
+ use_action_ts_head=True
9
+ use_one_embed=True
10
+ use_multi_scaling=False
11
+ mlp_type=ffn
12
+ decoder_num_blocks=2
13
+ robot_platform=16_li
14
+ without_head_drop_out=True
15
+ without_action_projector=True
16
+ ffn_type=gelu
17
+ use_l2norm=False
18
+ expand_inner_ratio=2.0
19
+ use_contrastive_loss=True
20
+ MODE=${RUN_MODE}_usecons${use_contrastive_loss}_newexinner_${expand_inner_ratio}_without_ap_ffn_type_${ffn_type}_use_l2norm${use_l2norm}_mlp_${mlp_type}_num_${decoder_num_blocks}
21
+ #========== !NOTE! ==========#
22
+ use_l1_regression=True
23
+ num_images_in_input=1
24
+ wandb_entity=chenghaha
25
+ wandb_project=fastvla
26
+ wandb_log_freq=1
27
+ use_proprio=False
28
+ use_diffusion=False
29
+ use_film=False
30
+ num_steps_before_decay=30000
31
+ save_freq=10000
32
+ max_steps=50000
33
+ vla_path=$ROOT_PATH/ai_models/openvla/openvla-7b
34
+ data_root_dir=$ROOT_PATH/datasets/openvla/modified_libero_rlds
35
+ dataset_name=libero_4_task_suites_no_noops
36
+ run_root_dir=$ROOT_PATH/vla_projects/$PROJECT_PATH/results/$RUN_MODE
37
+ #========== get run_id ==========#
38
+ note_parts=("${MODE}")
39
+
40
+ # if [ "$use_l1_regression" = "True" ]; then
41
+ # note_parts+=("L1_regression")
42
+ # fi
43
+
44
+ # if [ "$num_images_in_input" == 1 ]; then
45
+ # note_parts+=("3rd_person_img")
46
+ # else
47
+ # note_parts+=("3rd_person_img_and_wrist")
48
+ # fi
49
+
50
+ # if [ "$use_l1_regression" = "True" ]; then
51
+ # note_parts+=("proprio_state")
52
+ # fi
53
+
54
+ # if [ "$use_film" = "True" ]; then
55
+ # note_parts+=("Film")
56
+ # fi
57
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
58
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
59
+
60
+ #========== enter environment ==========#
61
+ conda activate openvla-oft
62
+ cd $ROOT_PATH/vla_projects/$PROJECT_PATH
63
+ export PYTHONPATH=$ROOT_PATH/vla_projects/$PROJECT_PATH
64
+
65
+ #========== run ==========#
66
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
67
+ --vla_path "$vla_path" \
68
+ --data_root_dir "$data_root_dir" \
69
+ --dataset_name "$dataset_name" \
70
+ --run_root_dir "$run_root_dir" \
71
+ --use_l1_regression "$use_l1_regression" \
72
+ --use_diffusion "$use_diffusion" \
73
+ --use_film "$use_film" \
74
+ --num_images_in_input "$num_images_in_input" \
75
+ --use_proprio "$use_proprio" \
76
+ --batch_size "$batch_size" \
77
+ --learning_rate 5e-4 \
78
+ --num_steps_before_decay "$num_steps_before_decay" \
79
+ --max_steps "$max_steps" \
80
+ --save_freq "$save_freq" \
81
+ --save_latest_checkpoint_only False \
82
+ --image_aug True \
83
+ --lora_rank 32 \
84
+ --wandb_entity "$wandb_entity" \
85
+ --wandb_project "$wandb_project" \
86
+ --wandb_log_freq "$wandb_log_freq" \
87
+ --run_id_note "$run_id_note_value" \
88
+ --use_predict_future_prop "$use_predict_future_prop" \
89
+ --use_action_ts_head "$use_action_ts_head" \
90
+ --use_one_embed "$use_one_embed" \
91
+ --use_multi_scaling "$use_multi_scaling" \
92
+ --mlp_type "$mlp_type" \
93
+ --decoder_num_blocks "$decoder_num_blocks" \
94
+ --robot_platform "$robot_platform" \
95
+ --proj_type "$proj_type" \
96
+ --ffn_type "$ffn_type" \
97
+ --use_l2norm "$use_l2norm" \
98
+ --expand_inner_ratio "$expand_inner_ratio" \
99
+ --without_action_projector "$without_action_projector" \
100
+ --use_contrastive_loss "$use_contrastive_loss"
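Note that this script passes --proj_type "$proj_type" to finetune.py but never assigns proj_type above, so the flag receives an empty string unless the variable is inherited from the calling environment.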
run_scripts/pp/pp.sh ADDED
@@ -0,0 +1,87 @@
1
+ #========== settings ==========#
2
+ PROJECT_PATH=fastvla_multi_scale_query
3
+ #========== !NOTE! ==========#
4
+ RUN_MODE=simvla_PP
5
+ use_predict_future_prop=True
6
+ batch_size=16
7
+ use_action_ts_head=True
8
+ use_one_embed=True
9
+ use_multi_scaling=False
10
+ mlp_type=ffn
11
+ decoder_num_blocks=4
12
+ robot_platform=libero
13
+ MODE=${RUN_MODE}_use_pp_${use_predict_future_prop}_use_ts_${use_action_ts_head}_use_one_${use_one_embed}_use_ms_${use_multi_scaling}_mlp_${mlp_type}_decoder_num_blocks_${decoder_num_blocks}
14
+ #========== !NOTE! ==========#
15
+ use_l1_regression=True
16
+ num_images_in_input=1
17
+ wandb_entity=chenghaha
18
+ wandb_project=fastvla
19
+ wandb_log_freq=1
20
+ use_proprio=True
21
+ use_diffusion=False
22
+ use_film=False
23
+ num_steps_before_decay=20000
24
+ save_freq=5000
25
+ max_steps=50000
26
+ vla_path=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/ai_models/openvla/openvla-7b
27
+ data_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/datasets/openvla/modified_libero_rlds
28
+ dataset_name=libero_4_task_suites_no_noops
29
+ run_root_dir=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH/results/$RUN_MODE
30
+ #========== get run_id ==========#
31
+ note_parts=("${MODE}")
32
+
33
+ # if [ "$use_l1_regression" = "True" ]; then
34
+ # note_parts+=("L1_regression")
35
+ # fi
36
+
37
+ # if [ "$num_images_in_input" == 1 ]; then
38
+ # note_parts+=("3rd_person_img")
39
+ # else
40
+ # note_parts+=("3rd_person_img_and_wrist")
41
+ # fi
42
+
43
+ # if [ "$use_l1_regression" = "True" ]; then
44
+ # note_parts+=("proprio_state")
45
+ # fi
46
+
47
+ # if [ "$use_film" = "True" ]; then
48
+ # note_parts+=("Film")
49
+ # fi
50
+ note_parts+=("M$max_steps-F$save_freq-D$num_steps_before_decay")
51
+ run_id_note_value=$(IFS='--'; echo "${note_parts[*]}")
52
+
53
+ #========== enter environment ==========#
54
+ # conda activate openvla-oft
55
+ cd /inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
56
+ export PYTHONPATH=/inspire/hdd/ws-f4d69b29-e0a5-44e6-bd92-acf4de9990f0/public-project/chengdongzhou-240108390137/vla_projects/$PROJECT_PATH
57
+
58
+ #========== run ==========#
59
+ WANDB_CONSOLE=off WANDB_MODE=offline torchrun --standalone --nnodes 1 --nproc-per-node 4 vla-scripts/finetune.py \
60
+ --vla_path "$vla_path" \
61
+ --data_root_dir "$data_root_dir" \
62
+ --dataset_name "$dataset_name" \
63
+ --run_root_dir "$run_root_dir" \
64
+ --use_l1_regression "$use_l1_regression" \
65
+ --use_diffusion "$use_diffusion" \
66
+ --use_film "$use_film" \
67
+ --num_images_in_input "$num_images_in_input" \
68
+ --use_proprio "$use_proprio" \
69
+ --batch_size "$batch_size" \
70
+ --learning_rate 5e-4 \
71
+ --num_steps_before_decay "$num_steps_before_decay" \
72
+ --max_steps "$max_steps" \
73
+ --save_freq "$save_freq" \
74
+ --save_latest_checkpoint_only False \
75
+ --image_aug True \
76
+ --lora_rank 32 \
77
+ --wandb_entity "$wandb_entity" \
78
+ --wandb_project "$wandb_project" \
79
+ --wandb_log_freq "$wandb_log_freq" \
80
+ --run_id_note "$run_id_note_value" \
81
+ --use_predict_future_prop "$use_predict_future_prop" \
82
+ --use_action_ts_head "$use_action_ts_head" \
83
+ --use_one_embed "$use_one_embed" \
84
+ --use_multi_scaling "$use_multi_scaling" \
85
+ --mlp_type "$mlp_type" \
86
+ --decoder_num_blocks "$decoder_num_blocks" \
87
+ --robot_platform "$robot_platform"
run_scripts/run.sh ADDED
@@ -0,0 +1,35 @@
1
+ # bash run_scripts/multiscaling/exp_multiscaling.sh
2
+
3
+ # bash run_scripts/ffn_or_gating/gating.sh
4
+
5
+ # bash run_scripts/ffn/ffn0.sh
6
+ # bash run_scripts/ffn/ffn2.sh
7
+ # bash run_scripts/ffn/ffn4.sh
8
+ # bash run_scripts/ffn/ffn6.sh
9
+ # bash run_scripts/ffn/ffn8.sh
10
+ # bash run_scripts/multiscaling/2latentmsahead.sh
11
+ # bash run_scripts/multiscaling/2msahead.sh
12
+ # bash run_scripts/ffn/2ffn6.sh
13
+ # bash run_scripts/all_input/2all_inputs.sh
14
+
15
+ # bash run_scripts/ffn_or_gating/gating.sh
16
+
17
+ # bash run_scripts/ffn/ffn0.sh
18
+ # bash run_scripts/ffn/ffn2.sh
19
+ # bash run_scripts/ffn/ffn4.sh
20
+ # bash run_scripts/ffn/ffn6.sh
21
+ # bash run_scripts/ffn/ffn8.sh
22
+
23
+
24
+ # bash run_scripts/ffn/3ffn2.sh
25
+ # bash run_scripts/ffn/3ffn6.sh
26
+ # bash run_scripts/ffn/3postffn2.sh
27
+ # bash run_scripts/ffn/3postffn6.sh
28
+
29
+ # bash run_scripts/ffn/4ffn_withactionprojector.sh
30
+ # bash run_scripts/ffn/4ffn6_withactionprojector.sh
31
+
32
+ # bash run_scripts/ffn/4ffn_withactionprojector.sh
33
+
34
+ bash run_scripts/ffn/5ffn_withactionprojector.sh
35
+ bash run_scripts/ffn/5ffn6_withactionprojector.sh
scripts/extern/verify_prismatic.py ADDED
@@ -0,0 +1,134 @@
1
+ """
2
+ verify_prismatic.py
3
+
4
+ Given an HF-exported Prismatic model, attempt to load via AutoClasses, and verify forward() and generate().
5
+ """
6
+
7
+ import time
8
+
9
+ import requests
10
+ import torch
11
+ from PIL import Image
12
+ from transformers import AutoModelForVision2Seq, AutoProcessor
13
+
14
+ # === Verification Arguments ===
15
+ MODEL_PATH = "TRI-ML/prismatic-siglip-224px-7b"
16
+ DEFAULT_IMAGE_URL = (
17
+ "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png"
18
+ )
19
+
20
+ if "-prism-" in MODEL_PATH:
21
+ SAMPLE_PROMPTS_FOR_GENERATION = [
22
+ "In: What is sitting in the coffee?\nOut:",
23
+ "In: What's the name of the food on the plate?\nOut:",
24
+ "In: caption.\nOut:",
25
+ "In: how many beinets..?\nOut:",
26
+ "In: Can you give me a lyrical description of the scene\nOut:",
27
+ ]
28
+ else:
29
+ SYSTEM_PROMPT = (
30
+ "A chat between a curious user and an artificial intelligence assistant. "
31
+ "The assistant gives helpful, detailed, and polite answers to the user's questions."
32
+ )
33
+ SAMPLE_PROMPTS_FOR_GENERATION = [
34
+ f"{SYSTEM_PROMPT} USER: What is sitting in the coffee? ASSISTANT:",
35
+ f"{SYSTEM_PROMPT} USER: What's the name of the food on the plate? ASSISTANT:",
36
+ f"{SYSTEM_PROMPT} USER: caption. ASSISTANT:",
37
+ f"{SYSTEM_PROMPT} USER: how many beinets..? ASSISTANT:",
38
+ f"{SYSTEM_PROMPT} USER: Can you give me a lyrical description of the scene ASSISTANT:",
39
+ ]
40
+
41
+
42
+ @torch.inference_mode()
43
+ def verify_prismatic() -> None:
44
+ print(f"[*] Verifying PrismaticForConditionalGeneration using Model `{MODEL_PATH}`")
45
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
46
+
47
+ # Load Processor & VLM
48
+ print("[*] Instantiating Processor and Pretrained VLM")
49
+ processor = AutoProcessor.from_pretrained(MODEL_PATH, trust_remote_code=True)
50
+
51
+ # === AUTOCAST MODE ===
52
+ # print("[*] Loading in BF16 Autocast Mode")
53
+ # vlm = AutoModelForVision2Seq.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True, trust_remote_code=True).to(
54
+ # device, dtype=torch.bfloat16
55
+ # )
56
+
57
+ # === NATIVE BFLOAT16 MODE ===
58
+ # print("[*] Loading in BF16")
59
+ # vlm = AutoModelForVision2Seq.from_pretrained(
60
+ # MODEL_PATH, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True
61
+ # ).to(device)
62
+
63
+ # === BFLOAT16 + FLASH-ATTN MODE :: [~14GB of VRAM Passive || 18GB of VRAM Active] ===
64
+ print("[*] Loading in BF16 with Flash-Attention Enabled")
65
+ vlm = AutoModelForVision2Seq.from_pretrained(
66
+ MODEL_PATH,
67
+ attn_implementation="flash_attention_2",
68
+ torch_dtype=torch.bfloat16,
69
+ low_cpu_mem_usage=True,
70
+ trust_remote_code=True,
71
+ ).to(device)
72
+
73
+ # === 8-BIT QUANTIZATION MODE (`pip install bitsandbytes`) :: [~9GB of VRAM Passive || 10GB of VRAM Active] ===
74
+ # print("[*] Loading in 8-Bit Quantization Mode")
75
+ # vlm = AutoModelForVision2Seq.from_pretrained(
76
+ # MODEL_PATH,
77
+ # attn_implementation="flash_attention_2",
78
+ # torch_dtype=torch.float16,
79
+ # quantization_config=BitsAndBytesConfig(load_in_8bit=True),
80
+ # low_cpu_mem_usage=True,
81
+ # trust_remote_code=True,
82
+ # )
83
+
84
+ # === 4-BIT QUANTIZATION MODE (`pip install bitsandbytes`) :: [~6GB of VRAM Passive || 7GB of VRAM Active] ===
85
+ # print("[*] Loading in 4-Bit Quantization Mode")
86
+ # vlm = AutoModelForVision2Seq.from_pretrained(
87
+ # MODEL_PATH,
88
+ # attn_implementation="flash_attention_2",
89
+ # torch_dtype=torch.float16,
90
+ # quantization_config=BitsAndBytesConfig(load_in_4bit=True),
91
+ # low_cpu_mem_usage=True,
92
+ # trust_remote_code=True,
93
+ # )
94
+
95
+ # Iterate over Sample Prompts =>> Generate
96
+ image = Image.open(requests.get(DEFAULT_IMAGE_URL, stream=True).raw).convert("RGB")
97
+ num_tokens, total_time = 0, 0.0
98
+
99
+ print("[*] Iterating over Sample Prompts\n===\n")
100
+ for idx, prompt in enumerate(SAMPLE_PROMPTS_FOR_GENERATION):
101
+ # === AUTOCAST MODE (Reproduces Prismatic `scripts/generate.py`) ===
102
+ # inputs = processor(prompt, image).to(device)
103
+ #
104
+ # # Using "autocast" to evaluate bit-wise equivalence to `scripts/generate.py`
105
+ # # =>> Running in native BF16 is also fine (but leads to slightly different generations)
106
+ # with torch.autocast("cuda", dtype=torch.bfloat16, enabled=True):
107
+ # gen_ids = vlm.generate(**inputs, do_sample=False, min_length=1, max_length=512)
108
+
109
+ # === BFLOAT16 MODE ===
110
+ inputs = processor(prompt, image).to(device, dtype=torch.bfloat16)
111
+
112
+ # === 8-BIT/4-BIT QUANTIZATION MODE ===
113
+ # inputs = processor(prompt, image).to(device, dtype=torch.float16)
114
+
115
+ # Run Inference
116
+ gen_ids = None
117
+ for _ in range(5):
118
+ start_time = time.time()
119
+ gen_ids = vlm.generate(**inputs, do_sample=False, min_length=1, max_length=512)
120
+ total_time += time.time() - start_time
121
+
122
+ gen_ids = gen_ids[0, inputs.input_ids.shape[1] :]
123
+ num_tokens += len(gen_ids)
124
+
125
+ # ===
126
+ gen_text = processor.decode(gen_ids, skip_special_tokens=True).strip()
127
+ print(f"[{idx + 1}] Input Prompt => {prompt}\n Generated => {gen_text}\n")
128
+
129
+ # Compute Tokens / Second
130
+ print(f"[*] Generated Tokens per Second = {num_tokens / total_time} w/ {num_tokens = } and {total_time = }")
131
+
132
+
133
+ if __name__ == "__main__":
134
+ verify_prismatic()
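A note on the throughput figure printed above: total_time accumulates five generate() calls per prompt, while num_tokens only counts the tokens of the final call, so the reported tokens-per-second is roughly one fifth of the per-run value (with do_sample=False the repeated generations should be identical). If per-run throughput is the intended metric, a minimal sketch of the alternative accounting inside the timing loop would be:

    for _ in range(5):
        start_time = time.time()
        gen_ids = vlm.generate(**inputs, do_sample=False, min_length=1, max_length=512)
        total_time += time.time() - start_time
        # Count the newly generated tokens for every timed run, not just the last one.
        num_tokens += len(gen_ids[0, inputs.input_ids.shape[1]:])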
scripts/pretrain.py ADDED
@@ -0,0 +1,238 @@
1
+ """
2
+ pretrain.py
3
+
4
+ Pretraining script for Prismatic VLM pretraining in native PyTorch, using Fully-Sharded Data Parallel (FSDP) to run
5
+ distributed training across GPUs. By default, assumes that CUDA toolkit is >= 11.0 (to support BF16 mixed precision).
6
+
7
+ Notes & Prerequisites:
8
+ - We're loading LLaMa-2 (and possibly other) gated models from HuggingFace (HF Hub); these require an auth_token.
9
+ For LLaMa-2, make sure to first get Meta approval, then fill out the form at the top of the HF LLaMa-2 page:
10
+ => Link: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf
11
+ => Generate Token (from `huggingface.co`): Settings / Access Tokens / New "Read" Token
12
+ => Set `cfg.hf_token` to file path with token (as single line text file) or environment variable name
13
+
14
+ - If you want to set a custom location for all HF / TIMM artifacts --> `export HF_HOME="<PATH>"` *before* running!
15
+ => For example (add to end of .bashrc): `export HF_HOME="/mnt/fsx/skaramcheti/cache"`
16
+
17
+ Run with:
18
+ - [Single Node One-GPU (Debug)] : torchrun --standalone --nnodes 1 --nproc-per-node 1 scripts/pretrain.py
19
+ - [Single Node Multi-GPU (= $K)]: torchrun --standalone --nnodes 1 --nproc-per-node $K scripts/pretrain.py
20
+ - [Multi-Node/AWS Sagemaker] Depends on your individual setup; file an issue if you have trouble!
21
+ """
22
+
23
+ import json
24
+ import os
25
+ from dataclasses import dataclass, field
26
+ from pathlib import Path
27
+ from typing import Optional, Tuple, Union
28
+
29
+ import draccus
30
+ import torch
31
+ import torch.distributed as dist
32
+ import yaml
33
+
34
+ from prismatic.conf import DatasetConfig, DatasetRegistry, ModelConfig, ModelRegistry
35
+ from prismatic.models import get_llm_backbone_and_tokenizer, get_vision_backbone_and_transform, get_vlm
36
+ from prismatic.overwatch import initialize_overwatch
37
+ from prismatic.preprocessing import get_dataset_and_collator
38
+ from prismatic.training import Metrics, get_train_strategy
39
+ from prismatic.util import set_global_seed
40
+
41
+ # Disable Tokenizers Parallelism to Play Nice w/ PyTorch Multiprocessing DataLoaders
42
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
43
+
44
+ # Initialize Overwatch =>> Wraps `logging.Logger`
45
+ overwatch = initialize_overwatch(__name__)
46
+
47
+
48
+ @dataclass
49
+ class PretrainConfig:
50
+ # fmt: off
51
+
52
+ # ModelConfig (`prismatic/conf/models.py`); override with --model.type `ModelRegistry.<MODEL>.model_id`
53
+ model: ModelConfig = field(
54
+ default_factory=ModelConfig.get_choice_class(ModelRegistry.PRISM_DINOSIGLIP_CONTROLLED_7B.model_id)
55
+ )
56
+
57
+ # DatasetConfig (`prismatic/conf/datasets.py`); override with --dataset.type `DatasetRegistry.<DATASET>.dataset_id`
58
+ dataset: DatasetConfig = field(
59
+ default_factory=DatasetConfig.get_choice_class(DatasetRegistry.LLAVA_V15.dataset_id)
60
+ )
61
+
62
+ # Pretraining Stage in < align (projector-only) | finetune (projector + LLM) | full-finetune (all) >
63
+ # ---
64
+ stage: str = "finetune" # Pretraining Stage in < align | finetune >
65
+ pretrained_checkpoint: Optional[Path] = None # Pretrained Checkpoint to Load (for `finetune`)
66
+ # if None =>> will match on (run_dir / `align`)
67
+
68
+ # Run Arguments
69
+ run_id: Optional[str] = None # Run ID for logging, Weights & Biases
70
+ run_root_dir: Path = Path("/mnt/fsx/x-prismatic-vlms/runs") # Path to directory to store logs & checkpoints
71
+ seed: int = 7 # Random seed (for reproducibility)
72
+
73
+ # HF Hub Credentials (for any gated models)
74
+ hf_token: Union[str, Path] = Path(".hf_token") # Environment variable or Path to HF Token
75
+
76
+ # Tracking Parameters
77
+ trackers: Tuple[str, ...] = ("jsonl", "wandb") # Trackers to initialize (if W&B, add config!)
78
+ wandb_project: str = "onyx-vlms" # Name of W&B project (default: `prismatic`)
79
+ wandb_entity: Optional[str] = "stanford-voltron" # Name of W&B entity (default: None)
80
+
81
+ def __post_init__(self) -> None:
82
+ """Set optimization parameters based on `stage` in {"align", "finetune"}."""
83
+ if self.stage == "align":
84
+ self.epochs = self.model.align_epochs
85
+ self.max_steps = self.model.align_max_steps
86
+ self.global_batch_size = self.model.align_global_batch_size
87
+ self.per_device_batch_size = self.model.align_per_device_batch_size
88
+
89
+ self.learning_rate = self.model.align_learning_rate
90
+ self.weight_decay = self.model.align_weight_decay
91
+ self.max_grad_norm = self.model.align_max_grad_norm
92
+ self.lr_scheduler_type = self.model.align_lr_scheduler_type
93
+ self.warmup_ratio = self.model.align_warmup_ratio
94
+
95
+ self.train_strategy = self.model.align_train_strategy
96
+
97
+ elif self.stage.endswith("finetune"):
98
+ self.epochs = self.model.finetune_epochs
99
+ self.max_steps = self.model.finetune_max_steps
100
+ self.global_batch_size = self.model.finetune_global_batch_size
101
+ self.per_device_batch_size = self.model.finetune_per_device_batch_size
102
+
103
+ self.learning_rate = self.model.finetune_learning_rate
104
+ self.weight_decay = self.model.finetune_weight_decay
105
+ self.max_grad_norm = self.model.finetune_max_grad_norm
106
+ self.lr_scheduler_type = self.model.finetune_lr_scheduler_type
107
+ self.warmup_ratio = self.model.finetune_warmup_ratio
108
+
109
+ self.train_strategy = self.model.finetune_train_strategy
110
+
111
+ else:
112
+ raise ValueError(f"Stage `{self.stage}` is not supported!")
113
+
114
+ # fmt: on
115
+
116
+
117
+ @draccus.wrap()
118
+ def pretrain(cfg: PretrainConfig) -> None:
119
+ overwatch.info("Prismatic VLM Training :: Gathering Light")
120
+
121
+ # Note => Under `torchrun` initializing `overwatch` will automatically set up `torch.distributed`
122
+ torch.cuda.set_device(device_id := overwatch.local_rank())
123
+ torch.cuda.empty_cache()
124
+
125
+ # Create Unique Run Name & Save Directory
126
+ model_id = cfg.model.model_id
127
+ if (dataset_id := cfg.dataset.dataset_id) == "llava-v15":
128
+ cfg.run_id = f"{model_id}+stage-{cfg.stage}+x{cfg.seed}" if cfg.run_id is None else cfg.run_id
129
+ else:
130
+ cfg.run_id = f"{dataset_id}+{model_id}+stage-{cfg.stage}+x{cfg.seed}" if cfg.run_id is None else cfg.run_id
131
+
132
+ # Start =>> Build Directories and Set Randomness
133
+ overwatch.info('"Life is like a prism; what you see depends on how you turn the glass."', ctx_level=1)
134
+ hf_token = cfg.hf_token.read_text().strip() if isinstance(cfg.hf_token, Path) else os.environ[cfg.hf_token]
135
+ worker_init_fn = set_global_seed(cfg.seed, get_worker_init_fn=True)
136
+ os.makedirs(run_dir := (cfg.run_root_dir / cfg.run_id), exist_ok=True)
137
+ os.makedirs(cfg.run_root_dir / cfg.run_id / "checkpoints", exist_ok=True)
138
+ if overwatch.is_rank_zero():
139
+ # Additionally save a JSON version of the config
140
+ draccus.dump(cfg, open(run_dir / "config.yaml", "w"))
141
+ with open(run_dir / "config.yaml", "r") as f_yaml, open(run_dir / "config.json", "w") as f_json:
142
+ yaml_cfg = yaml.safe_load(f_yaml)
143
+ json.dump(yaml_cfg, f_json, indent=2)
144
+
145
+ # Load Vision Backbone --> on CPU, in Full Precision (initializing model, image_transform via TIMM)
146
+ overwatch.info(f"Loading Vision Backbone [bold]{cfg.model.vision_backbone_id}[/] via TIMM ")
147
+ vision_backbone, image_transform = get_vision_backbone_and_transform(
148
+ cfg.model.vision_backbone_id, image_resize_strategy=cfg.model.image_resize_strategy
149
+ )
150
+
151
+ # Load LLM Backbone --> on CPU, in Full Precision (initializing Tokenizer + handling special tokens if necessary)
152
+ overwatch.info(f"Loading Pretrained LLM [bold]{cfg.model.llm_backbone_id}[/] via HF Transformers")
153
+ llm_backbone, tokenizer = get_llm_backbone_and_tokenizer(
154
+ cfg.model.llm_backbone_id, llm_max_length=cfg.model.llm_max_length, hf_token=hf_token
155
+ )
156
+
157
+ # Create VLM => wraps `vision_backbone` and `llm`
158
+ overwatch.info(f"Instantiating PrismaticVLM `{model_id}` for Training Stage = `{cfg.stage}`")
159
+ vlm = get_vlm(
160
+ model_id,
161
+ cfg.model.arch_specifier,
162
+ vision_backbone,
163
+ llm_backbone,
164
+ enable_mixed_precision_training=cfg.model.enable_mixed_precision_training,
165
+ )
166
+
167
+ # [Explicit] Call to `freeze_backbones` here for clarity => will log exactly what is frozen / what's not!
168
+ overwatch.info(f"Invoking `VLM.freeze_backbones()` for `{model_id}` => Training Stage: `{cfg.stage}`")
169
+ vlm.freeze_backbones(cfg.stage)
170
+
171
+ # Load Weights from Checkpoint (depends on stage, config)
172
+ overwatch.info(f"Invoking `VLM.load_checkpoint()` for `{model_id}` => Training Stage: `{cfg.stage}`")
173
+ vlm.load_from_checkpoint(cfg.stage, run_dir, pretrained_checkpoint=cfg.pretrained_checkpoint)
174
+
175
+ # Get Dataset for Specified Stage
176
+ overwatch.info(f"Creating Dataset `{cfg.dataset.dataset_id}` => Stage: `{cfg.stage}`")
177
+ train_dataset, collator = get_dataset_and_collator(
178
+ cfg.stage,
179
+ cfg.dataset,
180
+ image_transform,
181
+ tokenizer,
182
+ prompt_builder_fn=llm_backbone.prompt_builder_fn,
183
+ default_image_resolution=vision_backbone.default_image_resolution,
184
+ padding_side=tokenizer.padding_side,
185
+ )
186
+
187
+ # Create Train Strategy
188
+ overwatch.info(f"Initializing Train Strategy `{cfg.train_strategy}`")
189
+ train_strategy = get_train_strategy(
190
+ train_strategy=cfg.train_strategy,
191
+ vlm=vlm,
192
+ device_id=device_id,
193
+ stage=cfg.stage,
194
+ epochs=cfg.epochs,
195
+ max_steps=cfg.max_steps,
196
+ global_batch_size=cfg.global_batch_size,
197
+ per_device_batch_size=cfg.per_device_batch_size,
198
+ learning_rate=cfg.learning_rate,
199
+ weight_decay=cfg.weight_decay,
200
+ max_grad_norm=cfg.max_grad_norm,
201
+ lr_scheduler_type=cfg.lr_scheduler_type,
202
+ warmup_ratio=cfg.warmup_ratio,
203
+ enable_gradient_checkpointing=cfg.model.enable_gradient_checkpointing,
204
+ enable_mixed_precision_training=cfg.model.enable_mixed_precision_training,
205
+ reduce_in_full_precision=cfg.model.reduce_in_full_precision,
206
+ worker_init_fn=worker_init_fn,
207
+ )
208
+ train_strategy.run_setup(run_dir=run_dir, n_train_examples=len(train_dataset))
209
+
210
+ # Create Metrics =>> Handles on the fly tracking, logging to specified trackers (e.g., JSONL, Weights & Biases)
211
+ overwatch.info(f"Creating Metrics with Active Trackers => `{cfg.trackers}`")
212
+ metrics = Metrics(
213
+ cfg.trackers,
214
+ cfg.run_id,
215
+ run_dir,
216
+ draccus.encode(cfg),
217
+ cfg.stage,
218
+ wandb_project=cfg.wandb_project,
219
+ wandb_entity=cfg.wandb_entity,
220
+ grad_accumulation_steps=train_strategy.grad_accumulation_steps,
221
+ )
222
+
223
+ # Run Training
224
+ overwatch.info("Starting Training Loop")
225
+ train_strategy.run_training(train_dataset, collator, metrics, stage=cfg.stage, seed=cfg.seed)
226
+
227
+ # Finalize
228
+ overwatch.info("Done with Training =>> Finalizing Metrics")
229
+ metrics.finalize()
230
+
231
+ # And... we're done!
232
+ overwatch.info("... and that's all, folks!")
233
+ dist.barrier()
234
+ dist.destroy_process_group()
235
+
236
+
237
+ if __name__ == "__main__":
238
+ pretrain()
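As the module docstring notes, cfg.hf_token may be either a path to a single-line token file or the name of an environment variable; a minimal standalone sketch of the resolution logic used in pretrain() (the helper name here is illustrative):

    import os
    from pathlib import Path

    def resolve_hf_token(hf_token):
        # Path -> read the token from disk; str -> treat it as an environment variable name.
        return hf_token.read_text().strip() if isinstance(hf_token, Path) else os.environ[hf_token]

    # e.g. resolve_hf_token(Path(".hf_token")) or resolve_hf_token("HF_TOKEN")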
test_deepseek_moe.py ADDED
@@ -0,0 +1,246 @@
1
+ import torch
2
+ import torch.nn as nn
3
+ import sys
4
+ import os
5
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
6
+
7
+ # Mock constant definitions
8
+ ACTION_DIM = 7
9
+ NUM_ACTIONS_CHUNK = 8
10
+ SHORT_NUM_ACTIONS_CHUNK = 4
11
+ MID_NUM_ACTIONS_CHUNK = 6
12
+
13
+ # Import the relevant modules
14
+ from prismatic.models.action_heads import (
15
+ Expert,
16
+ DeepSeekV3AdaptiveBiasRouter,
17
+ MoELayer,
18
+ DeepSeekV3MoEActionHead,
19
+ TSActionHead
20
+ )
21
+
22
+ def test_deepseek_moe_components():
23
+ """Test the DeepSeek V3 MoE components."""
24
+ print("Testing DeepSeek V3 MoE components...")
25
+
26
+ # Test parameters
27
+ batch_size = 4
28
+ seq_len = 8
29
+ hidden_dim = 256
30
+ num_experts = 8
31
+ top_k = 2
32
+
33
+ # Create test data
34
+ x = torch.randn(batch_size, seq_len, hidden_dim)
35
+
36
+ print("\n1. Testing the GELU Expert network:")
37
+ try:
38
+ expert = Expert(hidden_dim)
39
+ output = expert(x.view(-1, hidden_dim))
40
+ print(f" Input shape: {x.view(-1, hidden_dim).shape}")
41
+ print(f" Output shape: {output.shape}")
42
+ assert output.shape == (batch_size * seq_len, hidden_dim)
43
+
44
+ # Verify that GELU activation is used
45
+ print(f" Activation function type: {type(expert.activation).__name__}")
46
+ assert isinstance(expert.activation, nn.GELU)
47
+ print(" ✓ GELU Expert network test passed")
48
+
49
+ except Exception as e:
50
+ print(f" ✗ GELU Expert network test failed: {e}")
51
+
52
+ print("\n2. Testing the DeepSeek V3 adaptive-bias router:")
53
+ try:
54
+ router = DeepSeekV3AdaptiveBiasRouter(hidden_dim, num_experts, top_k)
55
+ weights, indices = router(x)
56
+ print(f" Input shape: {x.shape}")
57
+ print(f" Weights shape: {weights.shape}")
58
+ print(f" Indices shape: {indices.shape}")
59
+
60
+ assert weights.shape == (batch_size, seq_len, top_k)
61
+ assert indices.shape == (batch_size, seq_len, top_k)
62
+
63
+ # Verify that the router has an adaptive bias
64
+ if router.enable_bias_correction:
65
+ print(f" Adaptive bias shape: {router.adaptive_bias.shape}")
66
+ assert router.adaptive_bias.shape == (num_experts,)
67
+
68
+ # Verify the load-balancing loss
69
+ loss = router.get_load_balancing_loss()
70
+ print(f" Load-balancing loss: {loss.item():.6f}")
71
+
72
+ print(" ✓ DeepSeek V3 router test passed")
73
+
74
+ except Exception as e:
75
+ print(f" ✗ DeepSeek V3 router test failed: {e}")
76
+
77
+ print("\n3. Testing the DeepSeek V3 MoE layer:")
78
+ try:
79
+ # Test the variant without a shared expert
80
+ moe_layer = MoELayer(
81
+ hidden_dim,
82
+ num_experts,
83
+ top_k,
84
+ enable_shared_expert=False
85
+ )
86
+ output = moe_layer(x)
87
+ print(f" Input shape: {x.shape}")
88
+ print(f" Output shape: {output.shape}")
89
+ assert output.shape == x.shape
90
+
91
+ # Test the variant with shared experts
92
+ moe_layer_shared = MoELayer(
93
+ hidden_dim,
94
+ num_experts,
95
+ top_k,
96
+ enable_shared_expert=True,
97
+ num_shared_experts=2
98
+ )
99
+ output_shared = moe_layer_shared(x)
100
+ print(f" Output shape with shared experts: {output_shared.shape}")
101
+ assert output_shared.shape == x.shape
102
+
103
+ # Verify load balancing
104
+ load_loss = moe_layer.get_load_balancing_loss()
105
+ print(f" Load-balancing loss: {load_loss.item():.6f}")
106
+
107
+ print(" ✓ DeepSeek V3 MoE layer test passed")
108
+
109
+ except Exception as e:
110
+ print(f" ✗ DeepSeek V3 MoE layer test failed: {e}")
111
+
112
+ def test_deepseek_moe_action_head():
113
+ """Test the DeepSeek V3 MoE action head."""
114
+ print("\n4. Testing the DeepSeek V3 MoE action head:")
115
+
116
+ # Test parameters
117
+ batch_size = 2
118
+ input_dim = 512
119
+ hidden_dim = 256
120
+ action_dim = 7
121
+
122
+ try:
123
+ # Create the model
124
+ model = DeepSeekV3MoEActionHead(
125
+ input_dim=input_dim,
126
+ hidden_dim=hidden_dim,
127
+ action_dim=action_dim,
128
+ num_routed_experts=8,
129
+ top_k=2,
130
+ num_moe_layers=2,
131
+ enable_shared_expert=True
132
+ )
133
+
134
+ # Test single-token input
135
+ actions_hidden_states_single = torch.randn(batch_size, 1, input_dim)
136
+ output_single = model.predict_action(actions_hidden_states_single)
137
+ print(f" Single-token input shape: {actions_hidden_states_single.shape}")
138
+ print(f" Single-token output shape: {output_single.shape}")
139
+ assert output_single.shape == (batch_size, NUM_ACTIONS_CHUNK, action_dim)
140
+
141
+ # Test multi-token input
142
+ actions_hidden_states_multi = torch.randn(batch_size, ACTION_DIM, input_dim)
143
+ output_multi = model.predict_action(actions_hidden_states_multi)
144
+ print(f" Multi-token input shape: {actions_hidden_states_multi.shape}")
145
+ print(f" Multi-token output shape: {output_multi.shape}")
146
+ assert output_multi.shape == (batch_size, NUM_ACTIONS_CHUNK, action_dim)
147
+
148
+ # Test the load-balancing loss
149
+ load_loss = model.get_load_balancing_loss()
150
+ print(f" Model load-balancing loss: {load_loss.item():.6f}")
151
+
152
+ # Test expert-usage statistics
153
+ model.train()
154
+ _ = model.predict_action(actions_hidden_states_single) # Trigger a statistics update
155
+ stats = model.get_expert_usage_stats()
156
+ print(f" Number of layers with expert-usage stats: {len(stats)}")
157
+
158
+ print(" ✓ DeepSeek V3 MoE action head test passed")
159
+
160
+ except Exception as e:
161
+ print(f" ✗ DeepSeek V3 MoE action head test failed: {e}")
162
+
163
+ def test_comparison_with_traditional_methods():
164
+ """Compare DeepSeek V3 MoE with traditional approaches."""
165
+ print("\n5. Performance comparison test:")
166
+
167
+ # Test parameters
168
+ batch_size = 2
169
+ input_dim = 512
170
+ hidden_dim = 256
171
+ action_dim = 7
172
+
173
+ try:
174
+ # Traditional FFN approach
175
+ model_ffn = TSActionHead(
176
+ input_dim=input_dim,
177
+ hidden_dim=hidden_dim,
178
+ action_dim=action_dim,
179
+ mlp_type='ffn',
180
+ decoder_num_blocks=2
181
+ )
182
+
183
+ # Legacy MoE approach
184
+ model_old_moe = TSActionHead(
185
+ input_dim=input_dim,
186
+ hidden_dim=hidden_dim,
187
+ action_dim=action_dim,
188
+ mlp_type='moe',
189
+ num_experts=8,
190
+ top_k=2,
191
+ decoder_num_blocks=2
192
+ )
193
+
194
+ # DeepSeek V3 MoE approach
195
+ model_deepseek_moe = DeepSeekV3MoEActionHead(
196
+ input_dim=input_dim,
197
+ hidden_dim=hidden_dim,
198
+ action_dim=action_dim,
199
+ num_routed_experts=8,
200
+ top_k=2,
201
+ num_moe_layers=2,
202
+ enable_shared_expert=True
203
+ )
204
+
205
+ # Count parameters
206
+ params_ffn = sum(p.numel() for p in model_ffn.parameters())
207
+ params_old_moe = sum(p.numel() for p in model_old_moe.parameters())
208
+ params_deepseek_moe = sum(p.numel() for p in model_deepseek_moe.parameters())
209
+
210
+ print(f" FFN model parameter count: {params_ffn:,}")
211
+ print(f" Legacy MoE parameter count: {params_old_moe:,}")
212
+ print(f" DeepSeek V3 MoE parameter count: {params_deepseek_moe:,}")
213
+ print(f" DeepSeek V3 vs FFN parameter ratio: {params_deepseek_moe / params_ffn:.2f}x")
214
+ print(f" DeepSeek V3 vs legacy MoE parameter ratio: {params_deepseek_moe / params_old_moe:.2f}x")
215
+
216
+ # Measure inference time (simple benchmark)
217
+ import time
218
+
219
+ test_input = torch.randn(batch_size, 1, input_dim)
220
+
221
+ # FFN inference time
222
+ start_time = time.time()
223
+ for _ in range(100):
224
+ _ = model_ffn.predict_action(test_input)
225
+ ffn_time = time.time() - start_time
226
+
227
+ # DeepSeek V3 MoE inference time
228
+ start_time = time.time()
229
+ for _ in range(100):
230
+ _ = model_deepseek_moe.predict_action(test_input)
231
+ deepseek_time = time.time() - start_time
232
+
233
+ print(f" FFN inference time (100 runs): {ffn_time:.4f}s")
235
+ print(f" DeepSeek V3 MoE inference time (100 runs): {deepseek_time:.4f}s")
236
+ print(f" Inference time ratio: {deepseek_time / ffn_time:.2f}x")
236
+
237
+ print(" ✓ Performance comparison test completed")
238
+
239
+ except Exception as e:
240
+ print(f" ✗ Performance comparison test failed: {e}")
241
+
242
+ if __name__ == "__main__":
243
+ test_deepseek_moe_components()
244
+ test_deepseek_moe_action_head()
245
+ test_comparison_with_traditional_methods()
246
+ print("\nAll DeepSeek V3 MoE tests completed!")
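The Expert, router, and MoELayer classes imported from prismatic.models.action_heads are not included in this commit listing. As a rough mental model of what the test above exercises, the sketch below shows a minimal, self-contained top-k routed MoE layer with GELU experts (shapes follow the test: [batch, seq, hidden]). This is an illustrative sketch under those assumptions, not the project's implementation; it omits the adaptive-bias correction, shared experts, and load-balancing loss.

    import torch
    import torch.nn as nn

    class TinyTopKMoE(nn.Module):
        # Route each token to its top-k experts and mix their outputs by the router weights.
        def __init__(self, hidden_dim: int, num_experts: int = 8, top_k: int = 2):
            super().__init__()
            self.top_k = top_k
            self.router = nn.Linear(hidden_dim, num_experts, bias=False)
            self.experts = nn.ModuleList(
                nn.Sequential(nn.Linear(hidden_dim, 4 * hidden_dim), nn.GELU(), nn.Linear(4 * hidden_dim, hidden_dim))
                for _ in range(num_experts)
            )

        def forward(self, x: torch.Tensor) -> torch.Tensor:
            logits = self.router(x)                              # [B, S, num_experts]
            weights, indices = logits.topk(self.top_k, dim=-1)   # both [B, S, top_k]
            weights = weights.softmax(dim=-1)                    # normalize over the selected experts
            out = torch.zeros_like(x)
            for slot in range(self.top_k):
                for e, expert in enumerate(self.experts):
                    mask = indices[..., slot] == e               # tokens routed to expert e in this slot
                    if mask.any():
                        out[mask] += weights[..., slot][mask].unsqueeze(-1) * expert(x[mask])
            return out

    moe = TinyTopKMoE(hidden_dim=256)
    y = moe(torch.randn(4, 8, 256))  # batch=4, seq=8, hidden=256, as in the test above
    assert y.shape == (4, 8, 256)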
vla-scripts/extern/convert_openvla_weights_to_hf.py ADDED
@@ -0,0 +1,272 @@
1
+ """
2
+ convert_openvla_weights_to_hf.py
3
+
4
+ Utility script for converting full OpenVLA VLA weights (from this repository, in the default "Prismatic" format) to
5
+ the HuggingFace "AutoClasses" (e.g., those defined in `prismatic.extern.hf_*`) for "native" use in `transformers``
6
+ via `trust_remote_code = True`.
7
+
8
+ Theoretically, these changes should be fully compatible with directly merging the models into `transformers` down the
9
+ line, with first-class support.
10
+
11
+ Usage:
12
+ python vla-scripts/extern/convert_openvla_weights_to_hf.py \
13
+ --openvla_model_path_or_id <PATH TO PRISMATIC TRAINING RUN DIR> \
14
+ --output_hf_model_local_path <OUTPUT DIR FOR CONVERTED CHECKPOINT>
15
+ """
16
+
17
+ import json
18
+ import os
19
+ import shutil
20
+ from dataclasses import dataclass
21
+ from pathlib import Path
22
+ from typing import Dict, Union
23
+
24
+ import draccus
25
+ import timm
26
+ import torch
27
+ import torch.nn as nn
28
+ from huggingface_hub import hf_hub_download
29
+ from timm.models.vision_transformer import LayerScale
30
+ from transformers import AutoTokenizer
31
+
32
+ from prismatic.conf import ModelConfig
33
+ from prismatic.extern.hf.configuration_prismatic import OpenVLAConfig
34
+ from prismatic.extern.hf.modeling_prismatic import OpenVLAForActionPrediction
35
+ from prismatic.extern.hf.processing_prismatic import PrismaticImageProcessor, PrismaticProcessor
36
+
37
+
38
+ @dataclass
39
+ class HFConvertConfig:
40
+ # fmt: off
41
+ openvla_model_path_or_id: Union[str, Path] = ( # Path to Pretrained VLA (on disk or HF Hub)
42
+ "runs/prism-dinosiglip-224px+mx-oxe-magic-soup-plus+n8+b32+x7"
43
+ )
44
+ output_hf_model_local_path: Path = Path( # Path to Local Path to save HF model
45
+ "hf-convert/openvla-7b"
46
+ )
47
+ output_hf_model_hub_path: str = "openvla/openvla-7b" # (Optional) Path to HF Hub Path to push
48
+ # model to
49
+
50
+ # HF Hub Credentials (required for Gated Models like LLaMa-2)
51
+ hf_token: Union[str, Path] = Path(".hf_token") # Environment variable or Path to HF Token
52
+
53
+ def __post_init__(self) -> None:
54
+ self.hf_token = self.hf_token.read_text().strip() if isinstance(self.hf_token, Path) else self.hf_token
55
+
56
+ # fmt: on
57
+
58
+
59
+ # HF Transformers overwrites parameters with names containing `gamma`; we're going to patch VisionBackbone.LayerScale.
60
+ # =>> TIMM :: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L109
61
+ # =>> Transformers :: https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py#L3960
62
+ def _ls_new_forward(self, x: torch.Tensor) -> torch.Tensor:
63
+ return x.mul_(self.scale_factor) if self.inplace else x * self.scale_factor
64
+
65
+
66
+ def ls_apply_patch(ls_module: LayerScale):
67
+ ls_module.scale_factor = nn.Parameter(ls_module.gamma.clone())
68
+ ls_module.forward = _ls_new_forward.__get__(ls_module, LayerScale)
69
+ del ls_module.gamma
70
+
71
+
72
+ # === Conversion Constants ===
73
+ PROJECTOR_KEY_MAPPING = {
74
+ "projector.0.weight": "projector.fc1.weight",
75
+ "projector.0.bias": "projector.fc1.bias",
76
+ "projector.2.weight": "projector.fc2.weight",
77
+ "projector.2.bias": "projector.fc2.bias",
78
+ "projector.4.weight": "projector.fc3.weight",
79
+ "projector.4.bias": "projector.fc3.bias",
80
+ }
81
+
82
+
83
+ def remap_state_dicts_for_hf(
84
+ prismatic_vision_backbone_state_dict: Dict[str, torch.Tensor],
85
+ projector_state_dict: Dict[str, torch.Tensor],
86
+ llm_backbone_state_dict: Dict[str, torch.Tensor],
87
+ use_fused_vision_backbone: bool = False,
88
+ ) -> Dict[str, torch.Tensor]:
89
+ """Iterate through Prismatic component state dictionaries and unify / fix key mapping for HF conversion."""
90
+ hf_state_dict = {}
91
+
92
+ # Iterate through Projector =>> use `PROJECTOR_KEY_MAPPING`
93
+ for key, value in projector_state_dict.items():
94
+ hf_state_dict[PROJECTOR_KEY_MAPPING[key]] = value
95
+
96
+ # Iterate through LLM Backbone =>> replace `llm.` with `language_model.`
97
+ for key, value in llm_backbone_state_dict.items():
98
+ hf_state_dict[key.replace("llm.", "language_model.")] = value
99
+
100
+ # Iterate through Vision Backbone =>> add "vision_backbone." prefix
101
+ if not use_fused_vision_backbone:
102
+ for key, value in prismatic_vision_backbone_state_dict.items():
103
+ hf_state_dict[key.replace("featurizer.", "vision_backbone.featurizer.")] = value
104
+ else:
105
+ # Note =>> Assumes that backbones are always DINO + SigLIP...
106
+ for key, value in prismatic_vision_backbone_state_dict.items():
107
+ if key.startswith("dino_featurizer"):
108
+ if key.endswith(".gamma"):
109
+ # Handle `LayerScale gamma` =>> DINOv2 only!
110
+ key = key.replace(".gamma", ".scale_factor")
111
+ hf_state_dict[key.replace("dino_featurizer.", "vision_backbone.featurizer.")] = value
112
+ elif key.startswith("siglip_featurizer"):
113
+ hf_state_dict[key.replace("siglip_featurizer.", "vision_backbone.fused_featurizer.")] = value
114
+
115
+ return hf_state_dict
116
+
117
+
118
+ @draccus.wrap()
119
+ def convert_openvla_weights_to_hf(cfg: HFConvertConfig) -> None:
120
+ print(f"[*] Converting OpenVLA Model `{cfg.openvla_model_path_or_id}` to HF Transformers Format")
121
+ torch.set_default_dtype(torch.bfloat16)
122
+
123
+ # Get `config.json`, 'dataset_statistics.json' and `checkpoint_pt` -- mirrors logic in `prismatic.models.load.py`
124
+ if os.path.isdir(cfg.openvla_model_path_or_id):
125
+ print(f"[*] Loading from Local Path `{(run_dir := Path(cfg.openvla_model_path_or_id))}`")
126
+ config_json, checkpoint_pt = run_dir / "config.json", run_dir / "checkpoints" / "latest-checkpoint.pt"
127
+ dataset_statistics_json = run_dir / "dataset_statistics.json"
128
+
129
+ assert config_json.exists(), f"Missing `config.json` for `{run_dir = }`"
130
+ assert checkpoint_pt.exists(), f"Missing checkpoint for `{run_dir = }`"
131
+ assert dataset_statistics_json.exists(), f"Missing `dataset_statistics.json` for `{run_dir = }`"
132
+ else:
133
+ print(f"[*] Downloading Prismatic Checkpoint from HF Hub :: `TRI-ML/{cfg.openvla_model_path_or_id}`")
134
+ config_json = hf_hub_download("openvla/openvla-dev", f"{cfg.openvla_model_path_or_id}/config.json")
135
+ checkpoint_pt = hf_hub_download(
136
+ "openvla/openvla-dev", f"{cfg.openvla_model_path_or_id}/checkpoints/latest-checkpoint.pt"
137
+ )
138
+ dataset_statistics_json = hf_hub_download(
139
+ "openvla/openvla-dev", f"{cfg.openvla_model_path_or_id}/dataset_statistics.json"
140
+ )
141
+
142
+ # Load "Native" Config JSON =>> Create LLM Config & Instantiate Tokenizer
143
+ with open(config_json, "r") as f:
144
+ vla_cfg = json.load(f)["vla"]
145
+ prismatic_config = ModelConfig.get_choice_class(vla_cfg["base_vlm"])().__dict__
146
+
147
+ # Load Normalization Statistics
148
+ with open(dataset_statistics_json, "r") as f:
149
+ norm_stats = json.load(f)
150
+
+     # Create HF OpenVLAConfig (`transformers.PretrainedConfig`)
+     hf_config = OpenVLAConfig(
+         vision_backbone_id=prismatic_config["vision_backbone_id"],
+         llm_backbone_id=prismatic_config["llm_backbone_id"],
+         arch_specifier=prismatic_config["arch_specifier"],
+         image_resize_strategy=prismatic_config["image_resize_strategy"],
+         llm_max_length=prismatic_config["llm_max_length"],
+         torch_dtype=torch.bfloat16,
+         norm_stats=norm_stats,
+     )
+
+     # Instantiate & Add Pad to Tokenizer =>> following `prismatic.models.materialize.get_llm_backbone_and_tokenizer`
+     # TODO (siddk) :: Implement batched generation -- in which case this should set `padding_side = "left"`!
+     print("[*] Instantiating and Patching Tokenizer, LLM Config")
+     tokenizer = AutoTokenizer.from_pretrained(
+         hf_config.hf_llm_id, model_max_length=hf_config.llm_max_length, token=cfg.hf_token, padding_side="right"
+     )
+     tokenizer.add_special_tokens({"pad_token": "<PAD>"})
+     tokenizer.init_kwargs.pop("add_prefix_space", None)  # Pop to prevent unnecessary warning on reload...
+     assert tokenizer.pad_token_id == hf_config.pad_token_id, "Incorrect Pad Token ID!"
+     assert len(tokenizer) > hf_config.text_config.vocab_size, "Tokenizer vocabulary must be larger than LLM vocabulary!"
+
+     # Patch LLM Config in `hf_config` with vocab_size (+ `hf_config.pad_to_multiple_of`), pad_token_id + validate
+     hf_config.text_config.vocab_size += hf_config.pad_to_multiple_of
+     hf_config.text_config.pad_token_id = hf_config.pad_token_id
+     hf_config.text_config.torch_dtype = torch.bfloat16
+     assert hf_config.text_config.use_cache, "LLM config `use_cache` should be True for inference (set default)!"
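+     # (Editor's illustrative comment, not in the original script) For the released OpenVLA-7B configuration this
+     # works out to: Llama-2 base vocab_size of 32000, one added "<PAD>" token, and `pad_to_multiple_of = 64`
+     # padding of the embedding matrix =>> patched `text_config.vocab_size = 32000 + 64 = 32064`.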
+
+     # Create Vision Backbone & Transform =>> following `prismatic.models.materialize.get_vision_backbone_and_transform`
+     # =>> Deviates a bit from existing code; as such, explicitly tested in `tests/test_image_transforms.py`
+     print("[*] Loading TIMM Vision Backbone(s) and Image Transform(s) =>> Initializing PrismaticImageProcessor")
+     input_sizes, interpolations, means, stds = [], [], [], []
+     for idx, timm_model_id in enumerate(hf_config.timm_model_ids):
+         timm_vision_backbone = timm.create_model(
+             timm_model_id,
+             pretrained=True,
+             num_classes=0,
+             img_size=hf_config.image_sizes[idx],
+             act_layer=hf_config.timm_override_act_layers[idx],
+         )
+
+         # Get Per-Backbone Image Processing
+         data_cfg = timm.data.resolve_model_data_config(timm_vision_backbone)
+         input_sizes.append((3, hf_config.image_sizes[idx], hf_config.image_sizes[idx]))
+         interpolations.append(data_cfg["interpolation"])
+         means.append(data_cfg["mean"])
+         stds.append(data_cfg["std"])
+
+         # Patch `LayerScale` because of HF annoying `fix_key` overwrite...
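+         # (Editor's note, illustrative) HF `from_pretrained` has legacy key handling that rewrites parameter names
+         # containing "gamma"/"beta" when loading state dicts, which would silently break TIMM's `LayerScale.gamma`;
+         # renaming the parameter to `scale_factor` (as in `remap_state_dicts_for_hf` above) side-steps that rewrite.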
+         for module in timm_vision_backbone.modules():
+             if isinstance(module, LayerScale):
+                 ls_apply_patch(module)
+
+     # Create PrismaticImageProcessor (`transformers.ImageProcessingMixin`)
+     hf_image_processor = PrismaticImageProcessor(
+         use_fused_vision_backbone=hf_config.use_fused_vision_backbone,
+         image_resize_strategy=hf_config.image_resize_strategy,
+         input_sizes=input_sizes,
+         interpolations=interpolations,
+         means=means,
+         stds=stds,
+     )
+
+     # Create top-level PrismaticProcessor (`transformers.ProcessorMixin` =>> enables registry w/ AutoProcessor)
+     print("[*] Creating PrismaticProcessor Instance from Tokenizer and PrismaticImageProcessor")
+     hf_processor = PrismaticProcessor(image_processor=hf_image_processor, tokenizer=tokenizer)
+
+     # Load Prismatic Model State Dictionary (in preparation for conversion)
+     print("[*] Loading Prismatic VLM State Dictionary from Checkpoint")
+     model_state_dict = torch.load(checkpoint_pt, map_location="cpu")["model"]
+     assert ("downsampler" not in model_state_dict) or (len(model_state_dict["downsampler"]) == 0), "Downsampler?"
+     assert all([k in model_state_dict for k in ["vision_backbone", "projector", "llm_backbone"]]), "Missing keys!"
+
+     # Convert
+     print("[*] Running Conversion")
+     converted_state_dict = remap_state_dicts_for_hf(
+         model_state_dict["vision_backbone"],
+         model_state_dict["projector"],
+         model_state_dict["llm_backbone"],
+         use_fused_vision_backbone=hf_config.use_fused_vision_backbone,
+     )
+
+     # Create PrismaticForConditionalGeneration =>> Note that we can't initialize on `meta` device because TIMM
+     print("[*] Building (Randomly Initialized) Model =>> OpenVLAForActionPrediction")
+     hf_model = OpenVLAForActionPrediction(hf_config)
+     hf_model.load_state_dict(converted_state_dict, strict=True, assign=True)
+
+     # Cast Model to BF16 before Saving
+     hf_model.to(torch.bfloat16)
+
+     # Save Pretrained Versions to Local Path
+     print("[*] Saving Model & Processor to Local Path")
+     hf_model.save_pretrained(cfg.output_hf_model_local_path, max_shard_size="7GB")
+     hf_image_processor.save_pretrained(cfg.output_hf_model_local_path)
+     hf_processor.save_pretrained(cfg.output_hf_model_local_path)
+
+     # Copy `dataset_statistics.json` File to Converted Checkpoint Directory
+     output_dataset_statistics_json = cfg.output_hf_model_local_path / "dataset_statistics.json"
+     shutil.copyfile(dataset_statistics_json, output_dataset_statistics_json)
+
+     print(f"[*] Saving Complete! Saved converted checkpoint to: {cfg.output_hf_model_local_path}")
+
+     #####################################################################################
+     # Optional: Push Model to Hugging Face Hub
+     #####################################################################################
+
+     # # Register AutoClasses
+     # OpenVLAConfig.register_for_auto_class()
+     # PrismaticImageProcessor.register_for_auto_class("AutoImageProcessor")
+     # PrismaticProcessor.register_for_auto_class("AutoProcessor")
+     # OpenVLAForActionPrediction.register_for_auto_class("AutoModelForVision2Seq")
+
+     # # Push to HF Hub
+     # print("[*] Pushing Model & Processor to HF Hub")
+     # hf_config.push_to_hub(cfg.output_hf_model_hub_path)
+     # hf_model.push_to_hub(cfg.output_hf_model_hub_path, max_shard_size="7GB")
+     # hf_image_processor.push_to_hub(cfg.output_hf_model_hub_path)
+     # hf_processor.push_to_hub(cfg.output_hf_model_hub_path)
+
+
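+ # (Editor's illustrative addition, not part of the original script) Rough usage sketch. The CLI flags mirror the
+ # `HFConvertConfig` fields referenced above; loading via the `Auto*` classes assumes either the optional
+ # registration block above is enabled or `trust_remote_code=True` is passed.
+ #
+ #   python vla-scripts/extern/convert_openvla_weights_to_hf.py \
+ #       --openvla_model_path_or_id <RUN_DIR_OR_HUB_ID> \
+ #       --output_hf_model_local_path <OUTPUT_DIR>
+ #
+ #   from transformers import AutoModelForVision2Seq, AutoProcessor
+ #   processor = AutoProcessor.from_pretrained("<OUTPUT_DIR>", trust_remote_code=True)
+ #   vla = AutoModelForVision2Seq.from_pretrained("<OUTPUT_DIR>", torch_dtype=torch.bfloat16, trust_remote_code=True)
+
+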
+ if __name__ == "__main__":
+     convert_openvla_weights_to_hf()