bmay committed
Commit 26791f7
Parent: 77b08da
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
Files changed (50)
  1. LICENSE +24 -0
  2. theia/__init__.py +1 -0
  3. theia/configs/dataset/ego4d.yaml +5 -0
  4. theia/configs/dataset/epic_kitchen.yaml +5 -0
  5. theia/configs/dataset/image_video_default.yaml +7 -0
  6. theia/configs/dataset/image_video_mix.yaml +8 -0
  7. theia/configs/dataset/imagenet.yaml +5 -0
  8. theia/configs/dataset/oxe_octo_mix.yaml +12 -0
  9. theia/configs/dataset/ssv2.yaml +5 -0
  10. theia/configs/logging/default.yaml +6 -0
  11. theia/configs/model/backbone/deit.yaml +2 -0
  12. theia/configs/model/backbone/deit_nocls.yaml +2 -0
  13. theia/configs/model/backbone/deit_reg.yaml +3 -0
  14. theia/configs/model/translator/conv.yaml +3 -0
  15. theia/configs/model/translator/lconv.yaml +3 -0
  16. theia/configs/model/translator/mlp.yaml +4 -0
  17. theia/configs/model/translator/transformer.yaml +5 -0
  18. theia/configs/train_rvfm_imagenet.yaml +9 -0
  19. theia/configs/training/frame_level.yaml +35 -0
  20. theia/configs/training/target_models/cdds.yaml +6 -0
  21. theia/configs/training/target_models/cddsv.yaml +7 -0
  22. theia/configs/training/target_models/cddv.yaml +6 -0
  23. theia/configs/training/target_models/cdesv.yaml +6 -0
  24. theia/configs/training/target_models/cdis.yaml +5 -0
  25. theia/configs/training/target_models/cdisv.yaml +6 -0
  26. theia/configs/training/target_models/cdiv.yaml +5 -0
  27. theia/configs/training/target_models/clip.yaml +3 -0
  28. theia/configs/training/target_models/ddsv.yaml +6 -0
  29. theia/configs/training/target_models/depth_anything.yaml +3 -0
  30. theia/configs/training/target_models/dinov2.yaml +3 -0
  31. theia/configs/training/target_models/sam.yaml +3 -0
  32. theia/configs/training/target_models/vit.yaml +3 -0
  33. theia/dataset/__init__.py +5 -0
  34. theia/dataset/data_utils.py +591 -0
  35. theia/dataset/image/__init__.py +3 -0
  36. theia/dataset/image/image_common.py +5 -0
  37. theia/dataset/oxe/__init__.py +1 -0
  38. theia/dataset/oxe/oxe_common.py +430 -0
  39. theia/dataset/oxe/oxe_mixes.py +139 -0
  40. theia/dataset/oxe/oxe_transforms.py +15 -0
  41. theia/dataset/video/__init__.py +3 -0
  42. theia/dataset/video/video_common.py +11 -0
  43. theia/decoding/__init__.py +5 -0
  44. theia/decoding/decode.py +198 -0
  45. theia/decoding/depth_anything.py +57 -0
  46. theia/decoding/dinov2.py +69 -0
  47. theia/decoding/sam.py +191 -0
  48. theia/example/decode_to_vfms.ipynb +69 -0
  49. theia/foundation_models/__init__.py +9 -0
  50. theia/foundation_models/common.py +87 -0
LICENSE ADDED
@@ -0,0 +1,24 @@
+ Copyright (c) 2024 Boston Dynamics AI Institute LLC
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ 1. Redistributions of source code must retain the copyright notice included
+ with the software, this list of conditions and the following disclaimer.
+ 2. Redistributions in binary form must reproduce the copyright notice, this
+ list of conditions and the following disclaimer in the documentation and/or
+ other materials provided with the distribution.
+ 3. Modified versions of the software must be conspicuously marked as such.
+ 4. The software may only be used for non-commercial research purposes.
+ For profit enterprises may use the software, subject to this limitation.
+
+ THIS SOFTWARE IS PROVIDED BY THE AI INSTITUTE AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, NON-
+ INFRINGEMENT,TITLE, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL THE AI INSTITUTE OR CONTRIBUTORS BE LIABLE FOR
+ ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, PUNITIVE OR CONSEQUENTIAL
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, DAMAGES ARISING OUT OF CLAIMS OF
+ INTELLECTUAL PROPERTY RIGHTS INFRINGEMENT; PROCUREMENT OF SUBSTITUTE GOODS OR
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
theia/__init__.py ADDED
@@ -0,0 +1 @@
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
theia/configs/dataset/ego4d.yaml ADDED
@@ -0,0 +1,5 @@
+ defaults:
+ - image_video_default
+
+ dataset_mix:
+ - "ego4d_1in150"
theia/configs/dataset/epic_kitchen.yaml ADDED
@@ -0,0 +1,5 @@
+ defaults:
+ - image_video_default
+
+ dataset_mix:
+ - "epic_kitchen_1in60"
theia/configs/dataset/image_video_default.yaml ADDED
@@ -0,0 +1,7 @@
+ return_metadata: False
+ shuffle: True
+ shuffle_buffer_size: 1024
+ feature_norm: True
+ dataset_root: "/storage/nfs/datasets/jshang/"
+ dataset_ratio: 0.1
+ load_action: False
theia/configs/dataset/image_video_mix.yaml ADDED
@@ -0,0 +1,8 @@
+ defaults:
+ - image_video_default
+
+ dataset_mix:
+ - "ego4d_1in150"
+ - "ssv2_1in32"
+ - "epic_kitchen_1in60"
+ - "imagenet"
theia/configs/dataset/imagenet.yaml ADDED
@@ -0,0 +1,5 @@
+ defaults:
+ - image_video_default
+
+ dataset_mix:
+ - "imagenet"
theia/configs/dataset/oxe_octo_mix.yaml ADDED
@@ -0,0 +1,12 @@
+ _target_: dataset.oxe.oxe_data_utils.OXEDataset
+ dataset_mix: "oxe_magic_soup"
+ image_action_set_root: "/storage/nfs/datasets/jshang/oxe_image_action"
+ feature_set_root: "/storage/nfs/datasets/jshang/oxe_vfm_features"
+ image_views: null
+ split: "train"
+ data_portion: 0.01
+ load_action: False
+ bf16: True
+ safe_tensors: True
+ trajectory_subsample_len: 32
+ return_metadata: False
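The _target_ key above follows the Hydra convention of naming the class to instantiate from the config. A minimal sketch of how such a config could be turned into a dataset object, assuming Hydra/OmegaConf is the config stack; the referenced OXEDataset class is outside this 50-file view, so this is illustrative only:

    # Sketch only: OXEDataset and the config path are assumptions, not shown in this diff.
    from hydra.utils import instantiate
    from omegaconf import OmegaConf

    cfg = OmegaConf.load("theia/configs/dataset/oxe_octo_mix.yaml")
    dataset = instantiate(cfg)  # would build dataset.oxe.oxe_data_utils.OXEDataset(**remaining keys)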
theia/configs/dataset/ssv2.yaml ADDED
@@ -0,0 +1,5 @@
+ defaults:
+ - image_video_default
+
+ dataset_mix:
+ - "ssv2_1in32"
theia/configs/logging/default.yaml ADDED
@@ -0,0 +1,6 @@
+ model_path: "/storage/nfs/jshang/trained_models"
+ log_path: "/storage/nfs/jshang/logs"
+ save_ckpt_interval: 20000
+ notes: ""
+ run_identifier_prefix: ""
+ project: "theia"
theia/configs/model/backbone/deit.yaml ADDED
@@ -0,0 +1,2 @@
+ backbone: facebook/deit-small-patch16-224
+ pretrained: False
theia/configs/model/backbone/deit_nocls.yaml ADDED
@@ -0,0 +1,2 @@
+ backbone: nocls-facebook/deit-tiny-patch16-224
+ pretrained: False
theia/configs/model/backbone/deit_reg.yaml ADDED
@@ -0,0 +1,3 @@
+ backbone: reg-facebook/deit-tiny-patch16-224
+ pretrained: False
+ num_reg_tokens: 7
theia/configs/model/translator/conv.yaml ADDED
@@ -0,0 +1,3 @@
+ type: "conv"
+ kwargs:
+ translator_hidden_size: 1024
theia/configs/model/translator/lconv.yaml ADDED
@@ -0,0 +1,3 @@
+ type: "lconv"
+ kwargs:
+ hidden_size_factor: 1.0
theia/configs/model/translator/mlp.yaml ADDED
@@ -0,0 +1,4 @@
+ type: "mlp"
+ kwargs:
+ translator_n_layer: 3
+ hidden_size: 1024
theia/configs/model/translator/transformer.yaml ADDED
@@ -0,0 +1,5 @@
+ type: "transformer"
+ kwargs:
+ translator_n_layers: 2
+ translator_n_heads: 8
+ translator_hidden_size: 1024
theia/configs/train_rvfm_imagenet.yaml ADDED
@@ -0,0 +1,9 @@
+ defaults:
+ - dataset: imagenet
+ - model/backbone: deit
+ - model/translator: lconv
+ - training: frame_level
+ - logging: default
+ - _self_
+
+ seed: 0
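The defaults list composes this top-level config from the group files added in this commit (dataset/imagenet.yaml, model/backbone/deit.yaml, model/translator/lconv.yaml, training/frame_level.yaml, logging/default.yaml). A minimal sketch of an entry point that would consume it, assuming Hydra is the composition framework and theia/configs is the config root; the actual training script is not part of this view:

    # Hypothetical entry point; everything except the config file name is an assumption.
    import hydra
    from omegaconf import DictConfig, OmegaConf

    @hydra.main(config_path="theia/configs", config_name="train_rvfm_imagenet", version_base=None)
    def main(cfg: DictConfig) -> None:
        # After composition, cfg.dataset, cfg.model.backbone, cfg.model.translator,
        # cfg.training, cfg.logging, and cfg.seed are populated from the group files above.
        print(OmegaConf.to_yaml(cfg))

    if __name__ == "__main__":
        main()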
theia/configs/training/frame_level.yaml ADDED
@@ -0,0 +1,35 @@
+ defaults:
+ - target_models: cdiv
+
+ epochs: 50
+ warm_up_steps_ratio: 0.1
+
+ base_lr: 2e-3
+ batch_size: 16
+ random_target_models: -1
+ num_workers: 8
+ # base training settings to scale lr, rarely changed
+ base_batch_size: 64
+ base_world_size: 8
+
+ weight_decay: 0.01
+
+
+ optimizer:
+ _target_: torch.optim.AdamW
+ betas: [0.9, 0.999]
+
+ lr_scheduler:
+ _target_: theia.lr_schedulers.get_constant_lrs_with_linear_warm_up
+ warm_up_lr_start_factor: 1e-2
+
+
+ grad_clip: False
+ grad_clip_norm_warmup: 10.0
+ grad_clip_norm: 1.0
+
+ freeze_translator: False
+ freeze_translator_start_steps_ratio: 0.2
+ translator_lr_factor: 1.0
+
+ main_loss: cos_l1
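base_batch_size and base_world_size are reference values for rescaling base_lr to the actual effective batch size; the training loop is not in this commit, so the exact rule is an assumption, but the conventional linear scaling would look like this:

    # Assumed linear LR scaling; the real rule lives in training code not shown here.
    def scaled_lr(base_lr: float, batch_size: int, world_size: int,
                  base_batch_size: int = 64, base_world_size: int = 8) -> float:
        # scale by the ratio of effective batch sizes (per-GPU batch * number of GPUs)
        return base_lr * (batch_size * world_size) / (base_batch_size * base_world_size)

    # With the values above on 8 GPUs: scaled_lr(2e-3, 16, 8) == 5e-4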
theia/configs/training/target_models/cdds.yaml ADDED
@@ -0,0 +1,6 @@
+ target_model_names:
+ - "facebook/dinov2-large"
+ - "openai/clip-vit-large-patch14"
+ - "facebook/sam-vit-huge"
+ - "LiheYoung/depth-anything-large-hf"
+ target_model_weights: null
theia/configs/training/target_models/cddsv.yaml ADDED
@@ -0,0 +1,7 @@
+ target_model_names:
+ - "google/vit-huge-patch14-224-in21k"
+ - "facebook/dinov2-large"
+ - "openai/clip-vit-large-patch14"
+ - "facebook/sam-vit-huge"
+ - "LiheYoung/depth-anything-large-hf"
+ target_model_weights: null
theia/configs/training/target_models/cddv.yaml ADDED
@@ -0,0 +1,6 @@
+ target_model_names:
+ - "google/vit-huge-patch14-224-in21k"
+ - "facebook/dinov2-large"
+ - "openai/clip-vit-large-patch14"
+ - "LiheYoung/depth-anything-large-hf"
+ target_model_weights: null
theia/configs/training/target_models/cdesv.yaml ADDED
@@ -0,0 +1,6 @@
+ target_model_names:
+ - "google/vit-huge-patch14-224-in21k"
+ - "openai/clip-vit-large-patch14"
+ - "facebook/sam-vit-huge"
+ - "LiheYoung/depth-anything-large-hf"
+ target_model_weights: null
theia/configs/training/target_models/cdis.yaml ADDED
@@ -0,0 +1,5 @@
+ target_model_names:
+ - "facebook/dinov2-large"
+ - "openai/clip-vit-large-patch14"
+ - "facebook/sam-vit-huge"
+ target_model_weights: null
theia/configs/training/target_models/cdisv.yaml ADDED
@@ -0,0 +1,6 @@
+ target_model_names:
+ - "google/vit-huge-patch14-224-in21k"
+ - "facebook/dinov2-large"
+ - "openai/clip-vit-large-patch14"
+ - "facebook/sam-vit-huge"
+ target_model_weights: null
theia/configs/training/target_models/cdiv.yaml ADDED
@@ -0,0 +1,5 @@
+ target_model_names:
+ - "google/vit-huge-patch14-224-in21k"
+ - "facebook/dinov2-large"
+ - "openai/clip-vit-large-patch14"
+ target_model_weights: null
theia/configs/training/target_models/clip.yaml ADDED
@@ -0,0 +1,3 @@
+ target_model_names:
+ - "openai/clip-vit-large-patch14"
+ target_model_weights: null
theia/configs/training/target_models/ddsv.yaml ADDED
@@ -0,0 +1,6 @@
+ target_model_names:
+ - "google/vit-huge-patch14-224-in21k"
+ - "facebook/dinov2-large"
+ - "facebook/sam-vit-huge"
+ - "LiheYoung/depth-anything-large-hf"
+ target_model_weights: null
theia/configs/training/target_models/depth_anything.yaml ADDED
@@ -0,0 +1,3 @@
+ target_model_names:
+ - "LiheYoung/depth-anything-large-hf"
+ target_model_weights: null
theia/configs/training/target_models/dinov2.yaml ADDED
@@ -0,0 +1,3 @@
+ target_model_names:
+ - "facebook/dinov2-large"
+ target_model_weights: null
theia/configs/training/target_models/sam.yaml ADDED
@@ -0,0 +1,3 @@
+ target_model_names:
+ - "facebook/sam-vit-huge"
+ target_model_weights: null
theia/configs/training/target_models/vit.yaml ADDED
@@ -0,0 +1,3 @@
+ target_model_names:
+ - "google/vit-huge-patch14-224-in21k"
+ target_model_weights: null
theia/dataset/__init__.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
+
+ from .image.image_common import ALL_IMAGE_DATASETS
+ from .oxe.oxe_common import ALL_OXE_DATASETS
+ from .video.video_common import ALL_VIDEO_DATASETS
theia/dataset/data_utils.py ADDED
@@ -0,0 +1,591 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ """Defines PyTorch datasets and dataloaders for multiple image, video, and OXE datasets.
4
+ Should use with webdataset >= 0.2.90. See https://github.com/webdataset/webdataset/pull/347"""
5
+
6
+ import glob
7
+ import json
8
+ import math
9
+ import os.path as osp
10
+ from collections import OrderedDict
11
+ from functools import partial
12
+ from io import BytesIO
13
+ from typing import Any, Callable, Generator, Iterator, Literal, Optional
14
+
15
+ import cv2
16
+ import numpy as np
17
+ import omegaconf
18
+ import torch
19
+ import webdataset as wds
20
+ from datasets.combine import DatasetType
21
+ from einops import rearrange
22
+ from numpy.typing import NDArray
23
+ from safetensors.torch import load as sft_load
24
+ from torch import default_generator
25
+ from torch.utils.data import DataLoader, Dataset, IterableDataset, default_collate
26
+
27
+ from theia.foundation_models.common import MODELS
28
+ from theia.dataset.oxe.oxe_common import ALL_OXE_DATASETS
29
+ from theia.dataset.oxe.oxe_mixes import OXE_NAMED_MIXES
30
+
31
+ PACKED_FEATURES = [model_name for model_name in MODELS if "llava" not in model_name]
32
+
33
+
34
+ def normalize_ds_weights_by_ds_len(weights: list[float], lengths: list[int]) -> tuple[list[float], float | Literal[0]]:
35
+ """Normalize dataset weights by dataset lengths (frames).
36
+
37
+ Args:
38
+ weights (list[float]): assigned weights.
39
+ lengths (list[int]): lengths of datasets.
40
+
41
+ Returns:
42
+ tuple[list[float], int]: normalized weights, and sum of the expected lengths of datasets
43
+ """
44
+ expected_lengths = [weight * length for weight, length in zip(weights, lengths, strict=False)]
45
+ sum_expected_lengths = sum(expected_lengths)
46
+ if sum_expected_lengths == 0:
47
+ raise ValueError("Sum of dataset length is 0.")
48
+ normalized_weights = [length * 1.0 / sum_expected_lengths for length in expected_lengths]
49
+ return normalized_weights, sum_expected_lengths
50
+
51
+
52
+ def get_vo_keys(dataset_name: str, image_views: Optional[list | str | dict[str, str | list[str]]] = None) -> list[str]:
53
+ """Get visual observation keys of datasets (to be compatible with OXE).
54
+
55
+ Args:
56
+ dataset_name (str): name of the dataset.
57
+ image_views (Optional[dict[str, str | list[str]]], optional): keys of selected views.
58
+ Defaults to None.
59
+
60
+ Returns:
61
+ list[str]: keys to the views in the dataset.
62
+ """
63
+ default_visual_observation_keys = ALL_OXE_DATASETS[dataset_name]["visual_observation_keys"][:1]
64
+ visual_observation_keys = []
65
+ if image_views is None:
66
+ visual_observation_keys = default_visual_observation_keys
67
+ elif isinstance(image_views, list):
68
+ visual_observation_keys = ALL_OXE_DATASETS[dataset_name]["visual_observation_keys"]
69
+ elif isinstance(image_views, str):
70
+ if image_views == "static":
71
+ visual_observation_keys = [
72
+ k
73
+ for k in ALL_OXE_DATASETS[dataset_name]["visual_observation_keys"]
74
+ if "wrist" not in k and "hand" not in k
75
+ ]
76
+ elif image_views == "wrist":
77
+ visual_observation_keys = [
78
+ k for k in ALL_OXE_DATASETS[dataset_name]["visual_observation_keys"] if "wrist" in k or "hand" in k
79
+ ]
80
+ if len(visual_observation_keys) == 0:
81
+ visual_observation_keys = default_visual_observation_keys
82
+ return visual_observation_keys
83
+
84
+
85
+ class RandomMix(IterableDataset):
86
+ """A random interleave of multiple iterable datasets."""
87
+
88
+ def __init__(
89
+ self,
90
+ datasets: list[IterableDataset],
91
+ probs: list[float] | NDArray | None = None,
92
+ stopping_strategy: str = "all_exhausted",
93
+ seed: Optional[int | str] = 0,
94
+ ) -> None:
95
+ """Initialization of a random interleave dataset.
96
+
97
+ Args:
98
+ datasets (list[IterableDataset]): datasets to be interleaved.
99
+ probs (list[float] | NDArray, optional): probability of each dataset. Defaults to None.
100
+ stopping_strategy (str, optional): when to end the sampling for one epoch. Defaults to `all_exhausted`.
101
+ `all_exhausted`: each sample in the dataset will be sampled at least once.
102
+ `first_exhausted`: when the first dataset runs out, the epoch ends.
103
+ See also https://huggingface.co/docs/datasets/en/stream#interleave for definitions.
104
+ seed (Optional[int | str]): seed. Defaults to 0.
105
+ """
106
+ self.datasets = datasets
107
+ if probs is None:
108
+ self.probs = [1.0] * len(self.datasets)
109
+ elif isinstance(probs, np.ndarray):
110
+ self.probs = probs.tolist()
111
+ else:
112
+ self.probs = probs
113
+ self.stopping_strategy = stopping_strategy
114
+ self.seed = seed
115
+
116
+ def __iter__(self) -> Generator:
117
+ """Return an iterator over the sources."""
118
+ sources = [iter(d) for d in self.datasets]
119
+ probs = self.probs[:]
120
+ seed_gen = torch.Generator()
121
+ seed_gen.manual_seed(self.seed)
122
+ cum = (np.array(probs) / np.sum(probs)).cumsum()
123
+ while len(sources) > 0:
124
+ r = torch.rand(1, generator=seed_gen).item()
125
+ i = np.searchsorted(cum, r)
126
+ try:
127
+ yield next(sources[i])
128
+ except StopIteration:
129
+ if self.stopping_strategy == "all_exhausted":
130
+ del sources[i]
131
+ del probs[i]
132
+ cum = (np.array(probs) / np.sum(probs)).cumsum()
133
+ elif self.stopping_strategy == "first_exhausted":
134
+ break
135
+
136
+
137
+ def decode_sample(
138
+ key: str, data: bytes, image_transform: Optional[Callable] = None, feature_transform: Optional[Callable] = None
139
+ ) -> Any:
140
+ """Decode a sample from bytes with optional image and feature transforms
141
+
142
+ Args:
143
+ key (str): key of an attribute (a column) of the sample.
144
+ data (bytes): original data bytes.
145
+ image_transform (Optional[Callable], optional): image transform. Defaults to None.
146
+ feature_transform (Optional[Callable], optional): feature transform. Defaults to None.
147
+
148
+ Returns:
149
+ Any: decoded data.
150
+ """
151
+ if ".safetensors" in key:
152
+ sft = sft_load(data)
153
+ embedding = rearrange(sft["embedding"], "c h w -> (h w) c")
154
+ if feature_transform is not None:
155
+ embedding = feature_transform(embedding)
156
+ if "cls_token" in sft:
157
+ cls = sft["cls_token"]
158
+ if feature_transform is not None:
159
+ cls = feature_transform(cls)
160
+ return {"embedding": embedding, "cls": cls}
161
+ return {"embedding": embedding}
162
+ elif key == ".image":
163
+ image = np.load(BytesIO(data))
164
+ if len(image.shape) == 2:
165
+ image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
166
+ elif len(image.shape) == 3 and image.shape[-1] == 4:
167
+ image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
168
+ if image_transform is not None:
169
+ return image_transform(image)
170
+ return image
171
+ else:
172
+ return data
173
+
174
+
175
+ def get_oxe_frame_dataset(
176
+ dataset_root: str,
177
+ dataset_mix: Optional[str | dict[str, float] | list] = "oxe_magic_soup",
178
+ feature_models: Optional[list[str]] = None,
179
+ split: str = "train",
180
+ dataset_ratio: float = 1.0,
181
+ image_views: Optional[dict[str, str | list[str]]] = None,
182
+ image_transform: Optional[Callable[[Any], torch.Tensor]] = None,
183
+ seed: Optional[int | str] = 0,
184
+ shuffle: bool = False,
185
+ world_size: int = 1,
186
+ ) -> tuple[dict[str, DatasetType], float | Literal[0]]:
187
+ """Get OXE datasets at frame level.
188
+
189
+ Args:
190
+ dataset_root (str): root dir of the datasets.
191
+ dataset_mix (Optional[str | dict[str, float] | list], optional): how to mix the datasets.
192
+ Defaults to "oxe_magic_soup".
193
+ feature_models (Optional[list[str]], optional): models to load their features. Defaults to None.
194
+ split (str, optional): split "train" or "val" or "test". Defaults to "train".
195
+ dataset_ratio (float, optional): how much data to use for the (combined) dataset. Defaults to 1.0.
196
+ image_views (Optional[dict[str, str | list[str]]], optional): image views to select. Defaults to None.
197
+ image_transform (Optional[Callable[[Any], torch.Tensor]], optional): image transform applied to samples.
198
+ Defaults to None.
199
+ seed (Optional[int | str], optional): seed. Defaults to 0.
200
+ shuffle (bool, optional): shuffle or not. Defaults to False.
201
+ world_size (int, optional): world size of DDP training. Defaults to 1.
202
+
203
+ Returns:
204
+ tuple[dict[str, DatasetType], int]: a dict of {dataset name: dataset class}.
205
+ """
206
+ # read dataset mix from any acceptable form
207
+ if isinstance(dataset_mix, str) and dataset_mix in OXE_NAMED_MIXES:
208
+ dataset_mix = OrderedDict({k: v for k, v in OXE_NAMED_MIXES[dataset_mix]})
209
+ elif isinstance(dataset_mix, dict):
210
+ dataset_mix = OrderedDict(**dataset_mix)
211
+ elif isinstance(dataset_mix, list):
212
+ dataset_mix = OrderedDict({d: 1.0 for d in dataset_mix})
213
+ else:
214
+ raise ValueError(f"dataset_mix of {dataset_mix}:{type(dataset_mix)} is not supported.")
215
+
216
+ if split == "eval" or split == "val":
217
+ dataset_mix = OrderedDict({d: 1.0 for d in dataset_mix})
218
+
219
+ # note down the dataset weights
220
+ dataset_weights: list[float] = []
221
+ # get frame level length
222
+ dataset_lens: list[int] = []
223
+
224
+ all_feature_datasets: dict[str, DatasetType] = {}
225
+ for dataset in dataset_mix:
226
+ visual_observation_keys = get_vo_keys(dataset_name=dataset, image_views=image_views)
227
+
228
+ if feature_models is None:
229
+ feature_models = PACKED_FEATURES
230
+
231
+ with open(osp.join(dataset_root, dataset, "splits.json"), "r") as splitf:
232
+ dataset_len = json.load(splitf)[split]
233
+ # if the length is 0, skip
234
+ # this may happen for small datasets with very few shards
235
+ if dataset_len == 0:
236
+ continue
237
+
238
+ for vo_key in visual_observation_keys:
239
+ for model_name in feature_models:
240
+ if model_name not in PACKED_FEATURES:
241
+ feature_set_name = model_name
242
+ path_pattern = osp.join(
243
+ dataset_root, dataset, vo_key + f"_{model_name.replace('/', '_')}", f"*-{split}*.tar"
244
+ )
245
+ rename_kw = {model_name: model_name.replace("/", "_") + ".safetensors"} # replace v by k
246
+ elif "packed" in all_feature_datasets:
247
+ continue
248
+ else:
249
+ feature_set_name = "packed"
250
+ path_pattern = osp.join(dataset_root, dataset, vo_key, f"*-{split}*.tar")
251
+ rename_kw = {
252
+ name: name.replace("/", "_") + ".safetensors" for name in PACKED_FEATURES
253
+ } # replace v by k
254
+ rename_kw["image"] = "image"
255
+
256
+ if feature_set_name not in all_feature_datasets:
257
+ all_feature_datasets[feature_set_name] = []
258
+
259
+ shard_paths = sorted(glob.glob(path_pattern))
260
+ num_shards = len(shard_paths)
261
+ if num_shards < world_size * 8:
262
+ shard_paths *= math.ceil(world_size * 8 / num_shards)
263
+ ds = (
264
+ wds.WebDataset(
265
+ shard_paths,
266
+ nodesplitter=wds.split_by_node,
267
+ workersplitter=wds.split_by_worker,
268
+ detshuffle=True,
269
+ shardshuffle=shuffle,
270
+ seed=seed,
271
+ )
272
+ .decode(partial(decode_sample, image_transform=image_transform))
273
+ .rename(keep=False, **rename_kw)
274
+ )
275
+ all_feature_datasets[feature_set_name].append(ds)
276
+
277
+ dataset_weights.append(dataset_mix[dataset])
278
+ dataset_lens.append(math.ceil(dataset_len * dataset_ratio))
279
+
280
+ normalized_dataset_weights, sum_expected_lengths = normalize_ds_weights_by_ds_len(dataset_weights, dataset_lens)
281
+
282
+ combined_feature_datasets: dict[str, Dataset] = {}
283
+ for feature_set_name, fds in all_feature_datasets.items():
284
+ ds = RandomMix(fds, probs=normalized_dataset_weights, stopping_strategy="all_exhausted")
285
+ combined_feature_datasets[feature_set_name] = ds
286
+
287
+ return combined_feature_datasets, sum_expected_lengths
288
+
289
+
290
+ def get_oxe_frame_dataloader(
291
+ datasets: dict[str, DatasetType], batch_size: Optional[int] = None, shuffle_buffer_size: int = 1_000, **kwargs: Any
292
+ ) -> dict[str, DataLoader]:
293
+ """Get dataloaders of OXE datasets. Corresponding to `get_oxe_frame_dataset()`.
294
+
295
+ Args:
296
+ datasets (dict[str, DatasetType]): OXE datasets from `get_oxe_frame_dataset()`.
297
+ batch_size (Optional[int], optional): batch size. Defaults to None.
298
+ shuffle_buffer_size (int, optional): buffer for shuffle while streaming. Defaults to 1_000.
299
+
300
+ Returns:
301
+ dict[str, DataLoader]: dataloaders. a dict of {dataset name: dataloader}.
302
+ """
303
+ loaders = {
304
+ k: (
305
+ wds.WebLoader(datasets[k], batch_size=None, **kwargs)
306
+ .shuffle(shuffle_buffer_size) # shuffle after mix
307
+ .batched(batch_size, collation_fn=default_collate)
308
+ )
309
+ for k in datasets
310
+ }
311
+ return loaders
312
+
313
+
314
+ def get_oxe_frame_iterator(
315
+ data_loaders: dict[str, DataLoader],
316
+ ) -> Iterator[dict[str, Any]]:
317
+ """Get iterator from dataloaders. Corresponding to `get_oxe_frame_dataloader()`.
318
+
319
+ Args:
320
+ data_loaders (dict[str, DataLoader]): dataloaders from `get_oxe_frame_dataloader()`.
321
+
322
+ Yields:
323
+ Iterator[dict[str, Any]]: data sample.
324
+ """
325
+ packed_loader = data_loaders.get("packed", None)
326
+ # place packed_loader at the first
327
+ if packed_loader is not None:
328
+ loaders = [packed_loader, *[data_loaders[k] for k in data_loaders if k != "packed"]]
329
+ else:
330
+ loaders = list(data_loaders.values())
331
+
332
+ # merge dicts
333
+ for data in zip(*loaders, strict=False):
334
+ # yield data
335
+ for i in range(1, len(loaders)):
336
+ for k in data[i]:
337
+ if k not in data[0]:
338
+ data[0][k] = data[i][k]
339
+ yield data[0]
340
+
341
+
342
+ def normalize_feature(
343
+ x: torch.Tensor, mean: Optional[torch.Tensor] = None, std: Optional[torch.Tensor] = None
344
+ ) -> torch.Tensor:
345
+ """Normalize the feature given mean and std.
346
+
347
+ Args:
348
+ x (torch.Tensor): input features
349
+ mean (Optional[torch.Tensor], optional): mean values. Defaults to None.
350
+ std (Optional[torch.Tensor], optional): std values. Defaults to None.
351
+
352
+ Returns:
353
+ torch.Tensor: feature after normalization
354
+ """
355
+ return x if mean is None or std is None else (x - mean) / std
356
+
357
+
358
+ def load_feature_stats(
359
+ dataset_root: str, feature_models: list[str]
360
+ ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]:
361
+ """Load feature statistics (mean and variance).
362
+
363
+ Args:
364
+ dataset_root (str): root dir of the dataset (or where to hold the statistics).
365
+ feature_models (list[str]): names of the models/features.
366
+
367
+ Returns:
368
+ tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]: means and variances. Keys are model names.
369
+ """
370
+ feature_means: dict[str, torch.Tensor] = {}
371
+ feature_vars: dict[str, torch.Tensor] = {}
372
+ for model in feature_models:
373
+ model_name = model.replace("/", "_")
374
+ feature_means[model] = torch.from_numpy(np.load(osp.join(dataset_root, f"imagenet_mean_{model_name}.npy"))).to(
375
+ torch.bfloat16
376
+ )
377
+ feature_vars[model] = torch.from_numpy(np.load(osp.join(dataset_root, f"imagenet_var_{model_name}.npy"))).to(
378
+ torch.bfloat16
379
+ )
380
+ return feature_means, feature_vars
381
+
382
+
383
+ def pad_shard_paths(shard_paths: list[str], num_shards: int, num_parts: int) -> list[str]:
384
+ """Pad shard paths so their count is divisible by the number of partitions (ranks*nodes).
385
+
386
+ Args:
387
+ shard_paths (list[str]): paths of dataset shards.
388
+ num_shards (int): number of shards.
389
+ num_parts (int): number of partitions.
390
+
391
+ Returns:
392
+ list[str]: shard paths padded.
393
+ """
394
+ final_shard_paths = shard_paths
395
+ if num_shards % num_parts != 0:
396
+ if num_shards < num_parts - num_shards:
397
+ for _ in range(math.floor((num_parts - num_shards) / num_shards)):
398
+ final_shard_paths += shard_paths[:]
399
+ final_shard_paths += shard_paths[: num_parts - len(final_shard_paths)]
400
+ else:
401
+ final_shard_paths += shard_paths[: num_parts - len(final_shard_paths)]
402
+ return final_shard_paths
403
+
404
+
405
+ def get_image_video_dataset(
406
+ dataset_root: str,
407
+ feature_models: list[str],
408
+ dataset_mix: Optional[str | dict[str, float] | list] = None,
409
+ split: str = "train",
410
+ dataset_ratio: float = 1.0,
411
+ image_transform: Optional[Callable[[Any], torch.Tensor]] = None,
412
+ feature_norm: bool = False,
413
+ seed: Optional[int | str] = 0,
414
+ shuffle: bool = False,
415
+ world_size: int = 1,
416
+ **kwargs: Any,
417
+ ) -> tuple[dict[str, DatasetType], float | Literal[0]]:
418
+ """Get image and video datasets at frame level.
419
+
420
+ Args:
421
+ dataset_root (str): root dir of the datasets.
422
+ feature_models (list[str]): models to load their features.
423
+ dataset_mix (Optional[str | dict[str, float] | list], optional): how to mix the datasets.
424
+ split (str, optional): split "train" or "val" or "test". Defaults to "train".
425
+ dataset_ratio (float, optional): how much data to use for the (combined) dataset. Defaults to 1.0.
426
+ image_transform (Optional[Callable[[Any], torch.Tensor]], optional): image transform applied to samples.
427
+ Defaults to None.
428
+ feature_norm: (bool, optional): whether to normalize the feature. Defaults to False.
429
+ seed (Optional[int | str], optional): seed. Defaults to 0.
430
+ shuffle (bool, optional): shuffle or not. Defaults to False.
431
+ world_size (int, optional): world size of DDP training. Defaults to 1.
432
+ kwargs (Any): arguments to pass-through.
433
+
434
+ Returns:
435
+ tuple[dict[str, DatasetType], int]: a dict of {dataset name: dataset class}.
436
+ """
437
+ # read dataset mix from any acceptable form
438
+ if isinstance(dataset_mix, str) and dataset_mix in OXE_NAMED_MIXES:
439
+ dataset_mix = OrderedDict({k: v for k, v in OXE_NAMED_MIXES[dataset_mix]})
440
+ elif isinstance(dataset_mix, dict):
441
+ dataset_mix = OrderedDict(**dataset_mix)
442
+ elif isinstance(dataset_mix, list) or isinstance(dataset_mix, omegaconf.listconfig.ListConfig):
443
+ dataset_mix = OrderedDict({d: 1.0 for d in dataset_mix})
444
+ else:
445
+ raise ValueError(f"dataset_mix of {dataset_mix}:{type(dataset_mix)} is not supported.")
446
+
447
+ if split == "eval" or split == "val":
448
+ dataset_mix = OrderedDict({d: 1.0 for d in dataset_mix})
449
+
450
+ # note down the dataset weights
451
+ dataset_weights: list[float] = []
452
+ # get frame level length
453
+ dataset_lens: list[int] = []
454
+
455
+ all_feature_datasets: dict[str, DatasetType] = {}
456
+
457
+ if feature_norm:
458
+ feature_means, feature_vars = load_feature_stats(dataset_root, feature_models)
459
+
460
+ for d in dataset_mix:
461
+
462
+ with open(osp.join(dataset_root, d, "splits.json"), "r") as splitf:
463
+ dataset_len = json.load(splitf)[split]
464
+
465
+ # if the length is 0, skip
466
+ # this may happen for small datasets with very few shards
467
+ if dataset_len == 0:
468
+ continue
469
+
470
+ path_pattern = osp.join(dataset_root, d, "images", f"*-{split}.tar")
471
+ if "image" not in all_feature_datasets:
472
+ all_feature_datasets["image"] = []
473
+ shard_paths = sorted(glob.glob(path_pattern))
474
+ num_shards = len(shard_paths)
475
+ num_parts = world_size
476
+ final_shard_paths = pad_shard_paths(shard_paths, num_shards, num_parts)
477
+ ds = wds.WebDataset(
478
+ final_shard_paths,
479
+ nodesplitter=wds.split_by_node,
480
+ workersplitter=wds.split_by_worker,
481
+ detshuffle=True,
482
+ shardshuffle=shuffle,
483
+ seed=seed,
484
+ ).decode(partial(decode_sample, image_transform=image_transform))
485
+ all_feature_datasets["image"].append(ds)
486
+
487
+ for model_name in feature_models:
488
+ path_pattern = osp.join(dataset_root, d, f"{model_name.replace('/', '_')}", f"*-{split}.tar")
489
+ rename_kw = {model_name: model_name.replace("/", "_").lower() + ".safetensors"} # replace v by k
490
+
491
+ if model_name not in all_feature_datasets:
492
+ all_feature_datasets[model_name] = []
493
+
494
+ shard_paths = sorted(glob.glob(path_pattern))
495
+ num_shards = len(shard_paths)
496
+ num_parts = world_size
497
+ final_shard_paths = pad_shard_paths(shard_paths, num_shards, num_parts)
498
+ if feature_norm:
499
+ feature_transform = partial(
500
+ normalize_feature, mean=feature_means[model_name], std=feature_vars[model_name]
501
+ )
502
+ else:
503
+ feature_transform = None
504
+ ds = (
505
+ wds.WebDataset(
506
+ final_shard_paths,
507
+ nodesplitter=wds.split_by_node,
508
+ workersplitter=wds.split_by_worker,
509
+ detshuffle=True,
510
+ shardshuffle=shuffle,
511
+ seed=seed,
512
+ )
513
+ .decode(partial(decode_sample, image_transform=image_transform, feature_transform=feature_transform))
514
+ .rename(keep=False, **rename_kw)
515
+ )
516
+ all_feature_datasets[model_name].append(ds)
517
+
518
+ dataset_weights.append(dataset_mix[d])
519
+ dataset_lens.append(math.ceil(dataset_len * dataset_ratio))
520
+
521
+ normalized_dataset_weights, sum_expected_lengths = normalize_ds_weights_by_ds_len(dataset_weights, dataset_lens)
522
+
523
+ combined_feature_datasets: dict[str, Dataset] = {}
524
+ for feature_set_name, fds in all_feature_datasets.items():
525
+ ds = RandomMix(fds, probs=normalized_dataset_weights, stopping_strategy="all_exhausted", seed=seed)
526
+ combined_feature_datasets[feature_set_name] = ds
527
+
528
+ return combined_feature_datasets, sum_expected_lengths
529
+
530
+
531
+ def get_frame_dataloader(
532
+ datasets: dict[str, DatasetType],
533
+ batch_size: Optional[int] = None,
534
+ shuffle: bool = False,
535
+ shuffle_buffer_size: int = 1_000,
536
+ seed: Optional[int] = 0,
537
+ **kwargs: Any,
538
+ ) -> dict[str, DataLoader]:
539
+ """Get dataloaders of image and video datasets. Corresponding to `get_image_video_dataset()`.
540
+
541
+ Args:
542
+ datasets (dict[str, DatasetType]): image and video datasets from `get_image_video_dataset()`.
543
+ batch_size (Optional[int], optional): batch size. Defaults to None.
544
+ shuffle_buffer_size (int, optional): buffer for shuffle while streaming. Defaults to 1_000.
545
+
546
+ Returns:
547
+ dict[str, DataLoader]: dataloaders. a dict of {dataset name: dataloader}.
548
+ """
549
+ loaders = {}
550
+ for k in datasets:
551
+ loader = wds.WebLoader(datasets[k], batch_size=None, generator=default_generator, **kwargs)
552
+ if shuffle:
553
+ loader = loader.shuffle(shuffle_buffer_size, seed=seed) # shuffle after mix
554
+ loader = loader.batched(batch_size, collation_fn=default_collate)
555
+ loaders[k] = loader
556
+ return loaders
557
+
558
+
559
+ def get_frame_iterator(
560
+ data_loaders: dict[str, DataLoader],
561
+ ) -> Iterator[dict[str, Any]]:
562
+ """Get iterator from image and video dataset dataloaders. Corresponding to `get_frame_dataloader()`.
563
+
564
+ Args:
565
+ data_loaders (dict[str, DataLoader]): dataloaders from `get_frame_dataloader()`.
566
+
567
+ Yields:
568
+ Iterator[dict[str, Any]]: data sample.
569
+ """
570
+ packed_loader = data_loaders.get("packed", None)
571
+ # place packed_loader at the first
572
+ if packed_loader is not None:
573
+ loaders = [packed_loader, *[data_loaders[k] for k in data_loaders if k != "packed"]]
574
+ else:
575
+ loaders = list(data_loaders.values())
576
+
577
+ # merge dicts
578
+ # this is to accommodate the old organization of datasets (each shard contains one or more columns,
579
+ # and images are duplicated columns).
580
+ # In new (current) dataset organization (columns are completely separated),
581
+ # column keys are all different except some "built-in" keys added by webdataset,
582
+ # but they are not related to any data, training, so on.
583
+ # During transit from old to new, where two organizations exist at the same time,
584
+ # this is to ignore extra "image" field in datasets loaded.
585
+ for data in zip(*loaders, strict=False):
586
+ # yield data
587
+ for i in range(1, len(loaders)):
588
+ for k in data[i]:
589
+ if k not in data[0]:
590
+ data[0][k] = data[i][k]
591
+ yield data[0]
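Putting the helpers in this file together, a typical image/video loading pipeline chains get_image_video_dataset, get_frame_dataloader, and get_frame_iterator. A usage sketch with placeholder paths; the webdataset shards and splits.json files must already exist on disk, and the sample key name is an assumption based on decode_sample above:

    # Usage sketch; dataset_root and the model list are placeholders.
    from theia.dataset.data_utils import (
        get_image_video_dataset,
        get_frame_dataloader,
        get_frame_iterator,
    )

    datasets, expected_len = get_image_video_dataset(
        dataset_root="/path/to/datasets",          # placeholder
        feature_models=["facebook/dinov2-large"],  # any subset of PACKED_FEATURES
        dataset_mix=["imagenet"],
        split="train",
        dataset_ratio=0.1,
        shuffle=True,
        world_size=1,
    )
    loaders = get_frame_dataloader(datasets, batch_size=16, shuffle=True, num_workers=8)
    for batch in get_frame_iterator(loaders):
        images = batch["image"]  # decoded, transformed frames (key name assumed)
        break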
theia/dataset/image/__init__.py ADDED
@@ -0,0 +1,3 @@
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
+
+ from .image_common import ALL_IMAGE_DATASETS
theia/dataset/image/image_common.py ADDED
@@ -0,0 +1,5 @@
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
+
+ from collections import OrderedDict
+
+ ALL_IMAGE_DATASETS = OrderedDict({"imagenet": {"steps": 1_281_167}})
theia/dataset/oxe/__init__.py ADDED
@@ -0,0 +1 @@
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
theia/dataset/oxe/oxe_common.py ADDED
@@ -0,0 +1,430 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ from collections import OrderedDict
4
+ from typing import Optional
5
+
6
+ """
7
+ This ALL_OXE_DATASETS below records metadata of all subsets of OXE dataset.
8
+ The datasets are in alphabetical order.
9
+
10
+ versions (list[str]): available and usable versions, sorted from older to newer.
11
+ Usually use the last one.
12
+ episodes (int): total episodes in the dataset.
13
+ steps (int): total steps in the dataset.
14
+ visual_observation_keys (list[str]): keys to specify image observations.
15
+ """
16
+ ALL_OXE_DATASETS: OrderedDict = OrderedDict(
17
+ {
18
+ "agent_aware_affordances": {
19
+ "versions": ["1.0.0"],
20
+ "episodes": 118,
21
+ "steps": 151628,
22
+ "visual_observation_keys": ["image"],
23
+ },
24
+ "asu_table_top_converted_externally_to_rlds": {
25
+ "versions": ["0.1.0"],
26
+ "episodes": 110,
27
+ "steps": 26113,
28
+ "visual_observation_keys": ["image"],
29
+ },
30
+ "austin_buds_dataset_converted_externally_to_rlds": {
31
+ "versions": ["0.1.0"],
32
+ "episodes": 50,
33
+ "steps": 34112,
34
+ "visual_observation_keys": ["image", "wrist_image"],
35
+ },
36
+ "austin_sailor_dataset_converted_externally_to_rlds": {
37
+ "versions": ["0.1.0"],
38
+ "episodes": 240,
39
+ "steps": 353094,
40
+ "visual_observation_keys": ["image", "wrist_image"],
41
+ },
42
+ "austin_sirius_dataset_converted_externally_to_rlds": {
43
+ "versions": ["0.1.0"],
44
+ "episodes": 559,
45
+ "steps": 279939,
46
+ "visual_observation_keys": ["image", "wrist_image"],
47
+ },
48
+ "bc_z": {
49
+ "versions": [
50
+ "0.1.0", # "1.0.0", "old1.0.1", and "1.0.1" are not usable
51
+ ],
52
+ "episodes": 39350,
53
+ "steps": 5471693,
54
+ "visual_observation_keys": ["image"],
55
+ },
56
+ "berkeley_autolab_ur5": {
57
+ "versions": ["0.1.0"],
58
+ "episodes": 896,
59
+ "steps": 87783,
60
+ "visual_observation_keys": ["image", "hand_image"],
61
+ },
62
+ "berkeley_cable_routing": {
63
+ "versions": ["0.1.0"],
64
+ "episodes": 1482,
65
+ "steps": 38240,
66
+ "visual_observation_keys": ["image", "top_image", "wrist225_image", "wrist45_image"],
67
+ },
68
+ "berkeley_fanuc_manipulation": {
69
+ "versions": ["0.1.0"],
70
+ "episodes": 415,
71
+ "steps": 62613,
72
+ "visual_observation_keys": ["image", "wrist_image"],
73
+ },
74
+ "berkeley_gnm_cory_hall": {
75
+ "versions": ["0.1.0"],
76
+ "episodes": 7331,
77
+ "steps": 156012,
78
+ "visual_observation_keys": ["image"],
79
+ },
80
+ "berkeley_gnm_recon": {
81
+ "versions": ["0.1.0"],
82
+ "episodes": 11834,
83
+ "steps": 610907,
84
+ "visual_observation_keys": ["image"],
85
+ },
86
+ "berkeley_gnm_sac_son": {
87
+ "versions": ["0.1.0"],
88
+ "episodes": 2955,
89
+ "steps": 241059,
90
+ "visual_observation_keys": ["image"],
91
+ },
92
+ "berkeley_mvp_converted_externally_to_rlds": {
93
+ "versions": ["0.1.0"],
94
+ "episodes": 480,
95
+ "steps": 45308,
96
+ "visual_observation_keys": ["hand_image"],
97
+ },
98
+ "berkeley_rpt_converted_externally_to_rlds": {
99
+ "versions": ["0.1.0"],
100
+ "episodes": 908,
101
+ "steps": 392578,
102
+ "visual_observation_keys": ["hand_image"],
103
+ },
104
+ "bridge": {"versions": ["0.1.0"], "episodes": 25460, "steps": 864292, "visual_observation_keys": ["image"]},
105
+ "cmu_franka_exploration_dataset_converted_externally_to_rlds": {
106
+ "versions": ["0.1.0"],
107
+ "episodes": 199,
108
+ "steps": 1990,
109
+ "visual_observation_keys": ["image"],
110
+ },
111
+ "cmu_play_fusion": {
112
+ "versions": ["0.1.0"],
113
+ "episodes": 576,
114
+ "steps": 235922,
115
+ "visual_observation_keys": ["image"],
116
+ },
117
+ "cmu_playing_with_food": { # this dataset seems to be corrupted
118
+ "versions": ["1.0.0"],
119
+ "episodes": 4200,
120
+ "steps": 83240,
121
+ "visual_observation_keys": ["image"],
122
+ },
123
+ "cmu_stretch": {"versions": ["0.1.0"], "episodes": 135, "steps": 25016, "visual_observation_keys": ["image"]},
124
+ "columbia_cairlab_pusht_real": {
125
+ "versions": ["0.1.0"],
126
+ "episodes": 122,
127
+ "steps": 24924,
128
+ "visual_observation_keys": ["image", "wrist_image"],
129
+ },
130
+ "dlr_edan_shared_control_converted_externally_to_rlds": {
131
+ "versions": ["0.1.0"],
132
+ "episodes": 104,
133
+ "steps": 8928,
134
+ "visual_observation_keys": ["image"],
135
+ },
136
+ "dlr_sara_grid_clamp_converted_externally_to_rlds": {
137
+ "versions": ["0.1.0"],
138
+ "episodes": 107,
139
+ "steps": 7622,
140
+ "visual_observation_keys": ["image"],
141
+ },
142
+ "dlr_sara_pour_converted_externally_to_rlds": {
143
+ "versions": ["0.1.0"],
144
+ "episodes": 100,
145
+ "steps": 12971,
146
+ "visual_observation_keys": ["image"],
147
+ },
148
+ "eth_agent_affordances": {
149
+ "versions": ["0.1.0"],
150
+ "episodes": 118,
151
+ "steps": 151628,
152
+ "visual_observation_keys": ["image"],
153
+ },
154
+ "fanuc_manipulation_v2": {
155
+ "versions": ["1.0.0"],
156
+ "episodes": 415,
157
+ "steps": 62613,
158
+ "visual_observation_keys": ["image", "wrist_image"],
159
+ },
160
+ "fractal20220817_data": {
161
+ "versions": ["0.1.0"],
162
+ "episodes": 87212,
163
+ "steps": 3786400,
164
+ "visual_observation_keys": ["image"],
165
+ },
166
+ "furniture_bench_dataset_converted_externally_to_rlds": {
167
+ "versions": ["0.1.0"],
168
+ "episodes": 5100,
169
+ "steps": 3948057,
170
+ "visual_observation_keys": ["image", "wrist_image"],
171
+ },
172
+ "iamlab_cmu_pickup_insert_converted_externally_to_rlds": {
173
+ "versions": ["0.1.0"],
174
+ "episodes": 631,
175
+ "steps": 146241,
176
+ "visual_observation_keys": ["image", "wrist_image"],
177
+ },
178
+ "imperial_wrist_dataset": {
179
+ "versions": ["1.0.0"],
180
+ "episodes": 170,
181
+ "steps": 7148,
182
+ "visual_observation_keys": ["image", "wrist_image"],
183
+ },
184
+ "imperialcollege_sawyer_wrist_cam": {
185
+ "versions": ["0.1.0"],
186
+ "episodes": 170,
187
+ "steps": 7148,
188
+ "visual_observation_keys": ["image", "wrist_image"],
189
+ },
190
+ "jaco_play": {
191
+ "versions": ["0.1.0"],
192
+ "episodes": 976,
193
+ "steps": 70127,
194
+ "visual_observation_keys": ["image", "image_wrist"],
195
+ },
196
+ "kaist_nonprehensile_converted_externally_to_rlds": {
197
+ "versions": ["0.1.0"],
198
+ "episodes": 201,
199
+ "steps": 32429,
200
+ "visual_observation_keys": ["image"],
201
+ },
202
+ "kuka": {"versions": ["0.1.0"], "episodes": 580392, "steps": 8583978, "visual_observation_keys": ["image"]},
203
+ "language_table": {
204
+ "versions": ["0.0.1", "0.1.0"],
205
+ "episodes": 442226,
206
+ "steps": 7045476,
207
+ "visual_observation_keys": ["rgb"],
208
+ },
209
+ "language_table_blocktoabsolute_oracle_sim": {
210
+ "versions": ["0.0.1"],
211
+ "episodes": 200000,
212
+ "steps": 15866385,
213
+ "visual_observation_keys": ["rgb"],
214
+ },
215
+ "language_table_blocktoblock_4block_sim": {
216
+ "versions": ["0.0.1"],
217
+ "episodes": 8298,
218
+ "steps": 326768,
219
+ "visual_observation_keys": ["rgb"],
220
+ },
221
+ "language_table_blocktoblock_oracle_sim": {
222
+ "versions": ["0.0.1"],
223
+ "episodes": 200000,
224
+ "steps": 12970620,
225
+ "visual_observation_keys": ["rgb"],
226
+ },
227
+ "language_table_blocktoblock_sim": {
228
+ "versions": ["0.0.1"],
229
+ "episodes": 8000,
230
+ "steps": 351688,
231
+ "visual_observation_keys": ["rgb"],
232
+ },
233
+ "language_table_blocktoblockrelative_oracle_sim": {
234
+ "versions": ["0.0.1"],
235
+ "episodes": 200000,
236
+ "steps": 13016749,
237
+ "visual_observation_keys": ["rgb"],
238
+ },
239
+ "language_table_blocktorelative_oracle_sim": {
240
+ "versions": ["0.0.1"],
241
+ "episodes": 200000,
242
+ "steps": 8655815,
243
+ "visual_observation_keys": ["rgb"],
244
+ },
245
+ "language_table_separate_oracle_sim": {
246
+ "versions": ["0.0.1"],
247
+ "episodes": 200000,
248
+ "steps": 3196661,
249
+ "visual_observation_keys": ["rgb"],
250
+ },
251
+ "language_table_sim": {
252
+ "versions": ["0.0.1"],
253
+ "episodes": 181020,
254
+ "steps": 4665423,
255
+ "visual_observation_keys": ["rgb"],
256
+ },
257
+ "maniskill_dataset_converted_externally_to_rlds": {
258
+ "versions": ["0.1.0"],
259
+ "episodes": 30213,
260
+ "steps": 4537402,
261
+ "visual_observation_keys": ["image", "wrist_image"],
262
+ },
263
+ "mutex_dataset": {
264
+ "versions": ["1.0.0"],
265
+ "episodes": 1500,
266
+ "steps": 361883,
267
+ "visual_observation_keys": ["image", "wrist_image"],
268
+ },
269
+ "nyu_door_opening_surprising_effectiveness": {
270
+ "versions": ["0.1.0"],
271
+ "episodes": 435,
272
+ "steps": 18196,
273
+ "visual_observation_keys": ["image"],
274
+ },
275
+ "nyu_franka_play_dataset_converted_externally_to_rlds": {
276
+ "versions": ["0.1.0"],
277
+ "episodes": 365,
278
+ "steps": 34448,
279
+ "visual_observation_keys": ["image", "image_additional_view"],
280
+ },
281
+ "nyu_rot_dataset_converted_externally_to_rlds": {
282
+ "versions": ["0.1.0"],
283
+ "episodes": 14,
284
+ "steps": 440,
285
+ "visual_observation_keys": ["image"],
286
+ },
287
+ "qut_dexterous_manpulation": {
288
+ "versions": ["0.1.0"],
289
+ "episodes": 200,
290
+ "steps": 176278,
291
+ "visual_observation_keys": ["image", "wrist_image"],
292
+ },
293
+ "robo_net": {
294
+ "versions": ["0.1.0", "1.0.0"],
295
+ "episodes": 82775,
296
+ "steps": 2483250,
297
+ "visual_observation_keys": ["image", "image1", "image2"],
298
+ },
299
+ "robot_vqa": {
300
+ "versions": ["0.1.0"],
301
+ "episodes": 3331523,
302
+ "steps": 3331523,
303
+ "visual_observation_keys": ["images"],
304
+ },
305
+ "roboturk": {
306
+ "versions": ["0.1.0"],
307
+ "episodes": 1796,
308
+ "steps": 168423,
309
+ "visual_observation_keys": ["front_rgb"],
310
+ },
311
+ "stanford_hydra_dataset_converted_externally_to_rlds": {
312
+ "versions": ["0.1.0"],
313
+ "episodes": 570,
314
+ "steps": 358234,
315
+ "visual_observation_keys": ["image", "wrist_image"],
316
+ },
317
+ "stanford_kuka_multimodal_dataset_converted_externally_to_rlds": {
318
+ "versions": ["0.1.0"],
319
+ "episodes": 3000,
320
+ "steps": 149985,
321
+ "visual_observation_keys": ["image"],
322
+ },
323
+ "stanford_mask_vit_converted_externally_to_rlds": {
324
+ "versions": ["0.1.0"],
325
+ "episodes": 9109,
326
+ "steps": 282379,
327
+ "visual_observation_keys": ["image"],
328
+ },
329
+ "stanford_robocook_converted_externally_to_rlds": {
330
+ "versions": ["0.1.0"],
331
+ "episodes": 2460,
332
+ "steps": 112980,
333
+ "visual_observation_keys": ["image_1", "image_2", "image_3", "image_4"],
334
+ },
335
+ "taco_play": {
336
+ "versions": ["0.1.0"],
337
+ "episodes": 3242,
338
+ "steps": 213972,
339
+ "visual_observation_keys": ["rgb_static", "rgb_gripper"],
340
+ },
341
+ "tokyo_u_lsmo_converted_externally_to_rlds": {
342
+ "versions": ["0.1.0"],
343
+ "episodes": 50,
344
+ "steps": 11925,
345
+ "visual_observation_keys": ["image"],
346
+ },
347
+ "toto": {"versions": ["0.1.0"], "episodes": 902, "steps": 294139, "visual_observation_keys": ["image"]},
348
+ "ucsd_kitchen_dataset_converted_externally_to_rlds": {
349
+ "versions": ["0.1.0"],
350
+ "episodes": 150,
351
+ "steps": 3970,
352
+ "visual_observation_keys": ["image"],
353
+ },
354
+ "ucsd_pick_and_place_dataset_converted_externally_to_rlds": {
355
+ "versions": ["0.1.0"],
356
+ "episodes": 1355,
357
+ "steps": 67750,
358
+ "visual_observation_keys": ["image"],
359
+ },
360
+ "uiuc_d3field": { # this dataset seems to be corrupted
361
+ "versions": ["0.1.0", "1.1.2"],
362
+ "episodes": 196,
363
+ "steps": 13384,
364
+ "visual_observation_keys": ["image_1", "image_2", "image_3", "image_4"],
365
+ },
366
+ "usc_cloth_sim_converted_externally_to_rlds": {
367
+ "versions": ["0.1.0"],
368
+ "episodes": 800,
369
+ "steps": 80000,
370
+ "visual_observation_keys": ["image"],
371
+ },
372
+ "utaustin_mutex": {
373
+ "versions": ["0.1.0"],
374
+ "episodes": 1500,
375
+ "steps": 361883,
376
+ "visual_observation_keys": ["image", "wrist_image"],
377
+ },
378
+ "utokyo_pr2_opening_fridge_converted_externally_to_rlds": {
379
+ "versions": ["0.1.0"],
380
+ "episodes": 64,
381
+ "steps": 9140,
382
+ "visual_observation_keys": ["image"],
383
+ },
384
+ "utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds": {
385
+ "versions": ["0.1.0"],
386
+ "episodes": 192,
387
+ "steps": 26346,
388
+ "visual_observation_keys": ["image"],
389
+ },
390
+ "utokyo_saytap_converted_externally_to_rlds": {
391
+ "versions": ["0.1.0"],
392
+ "episodes": 20,
393
+ "steps": 22937,
394
+ "visual_observation_keys": ["image", "wrist_image"],
395
+ },
396
+ "utokyo_xarm_bimanual_converted_externally_to_rlds": {
397
+ "versions": ["0.1.0"],
398
+ "episodes": 64,
399
+ "steps": 1388,
400
+ "visual_observation_keys": ["image"],
401
+ },
402
+ "utokyo_xarm_pick_and_place_converted_externally_to_rlds": {
403
+ "versions": ["0.1.0"],
404
+ "episodes": 92,
405
+ "steps": 6789,
406
+ "visual_observation_keys": ["image", "hand_image", "image2"],
407
+ },
408
+ "viola": {
409
+ "versions": ["0.1.0"],
410
+ "episodes": 135,
411
+ "steps": 68913,
412
+ "visual_observation_keys": ["agentview_rgb", "eye_in_hand_rgb"],
413
+ },
414
+ }
415
+ )
416
+
417
+
418
+ def oxe_dsname2path(dataset_name: str, version: Optional[str] = None) -> str:
419
+ """From dataset name to the remote Google Cloud path of the dataset.
420
+
421
+ Args:
422
+ dataset_name (str): dataset name.
423
+ version (Optional[str]): version string.
424
+
425
+ Returns:
426
+ str: Google Cloud path
427
+ """
428
+ if version is None:
429
+ version = ALL_OXE_DATASETS[dataset_name]["versions"][-1]
430
+ return f"gs://gresearch/robotics/{dataset_name}/{version}"
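The metadata table and helper above can be queried directly; a small example that runs against the metadata only, no data download needed:

    from theia.dataset.oxe.oxe_common import ALL_OXE_DATASETS, oxe_dsname2path

    meta = ALL_OXE_DATASETS["taco_play"]
    print(meta["steps"])                    # 213972
    print(meta["visual_observation_keys"])  # ['rgb_static', 'rgb_gripper']
    print(oxe_dsname2path("taco_play"))     # gs://gresearch/robotics/taco_play/0.1.0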
theia/dataset/oxe/oxe_mixes.py ADDED
@@ -0,0 +1,139 @@
1
+ # File modified. Modifications Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ """MIT License Copyright (c) 2023 Robotic AI & Learning Lab Berkeley
4
+
5
+ From Octo https://github.com/octo-models/octo/blob/main/octo/data/oxe/oxe_dataset_mixes.py
6
+ """
7
+
8
+ BRIDGE_MIX = [
9
+ ("bridge_dataset", 1.0),
10
+ ]
11
+
12
+ RT_X_MIX = [
13
+ ("fractal20220817_data", 0.54087122203),
14
+ ("kuka", 0.8341046294),
15
+ ("bridge_dataset", 1.0),
16
+ ("taco_play", 2.0),
17
+ ("jaco_play", 2.0),
18
+ ("berkeley_cable_routing", 3.0),
19
+ ("roboturk", 1.0),
20
+ ("nyu_door_opening_surprising_effectiveness", 5.0),
21
+ ("viola", 2.0),
22
+ ("berkeley_autolab_ur5", 1.0),
23
+ ("toto", 1.0),
24
+ ]
25
+
26
+
27
+ OXE_FRANKA_MIX = [
28
+ ("taco_play", 1.0),
29
+ ("berkeley_cable_routing", 1.0),
30
+ ("viola", 1.0),
31
+ ("toto", 1.0),
32
+ ("stanford_hydra_dataset_converted_externally_to_rlds", 1.0),
33
+ ("austin_buds_dataset_converted_externally_to_rlds", 3.0),
34
+ ("nyu_franka_play_dataset_converted_externally_to_rlds", 3.0),
35
+ ("maniskill_dataset_converted_externally_to_rlds", 0.1),
36
+ ("furniture_bench_dataset_converted_externally_to_rlds", 0.1),
37
+ ("cmu_franka_exploration_dataset_converted_externally_to_rlds", 5.0),
38
+ ("austin_sailor_dataset_converted_externally_to_rlds", 1.0),
39
+ ("austin_sirius_dataset_converted_externally_to_rlds", 1.0),
40
+ ("berkeley_rpt_converted_externally_to_rlds", 1.0),
41
+ ("kaist_nonprehensile_converted_externally_to_rlds", 3.0),
42
+ ("stanford_robocook_converted_externally_to_rlds", 1.0),
43
+ ("iamlab_cmu_pickup_insert_converted_externally_to_rlds", 1.0),
44
+ ("utaustin_mutex", 1.0),
45
+ # ("cmu_playing_with_food", 1.0),
46
+ ("cmu_play_fusion", 1.0),
47
+ ]
48
+
49
+ OXE_MAGIC_SOUP = [
50
+ ("fractal20220817_data", 0.54087122203),
51
+ ("kuka", 0.8341046294),
52
+ ("bridge", 1.0),
53
+ ("taco_play", 2.0),
54
+ ("jaco_play", 1.0),
55
+ ("berkeley_cable_routing", 1.0),
56
+ ("roboturk", 2.0),
57
+ ("nyu_door_opening_surprising_effectiveness", 1.0),
58
+ ("viola", 2.0),
59
+ ("berkeley_autolab_ur5", 2.0),
60
+ ("toto", 1.0),
61
+ ("language_table", 0.1),
62
+ ("stanford_hydra_dataset_converted_externally_to_rlds", 2.0),
63
+ ("austin_buds_dataset_converted_externally_to_rlds", 1.0),
64
+ ("nyu_franka_play_dataset_converted_externally_to_rlds", 3.0),
65
+ ("furniture_bench_dataset_converted_externally_to_rlds", 0.1),
66
+ ("ucsd_kitchen_dataset_converted_externally_to_rlds", 2.0),
67
+ ("austin_sailor_dataset_converted_externally_to_rlds", 1.0),
68
+ ("austin_sirius_dataset_converted_externally_to_rlds", 1.0),
69
+ ("bc_z", 0.2),
70
+ ("dlr_edan_shared_control_converted_externally_to_rlds", 1.0),
71
+ ("iamlab_cmu_pickup_insert_converted_externally_to_rlds", 1.0),
72
+ # ("uiuc_d3field", 1.0), --> somehow raw data is broken
73
+ ("utaustin_mutex", 1.0),
74
+ ("berkeley_fanuc_manipulation", 2.0),
75
+ ("cmu_stretch", 1.0),
76
+ ]
77
+
78
+
79
+ OXE_FULL_MIX = [
80
+ ("fractal20220817_data", 1.0),
81
+ ("kuka", 1.0),
82
+ ("bridge_dataset", 1),
83
+ ("taco_play", 1.0),
84
+ ("jaco_play", 1.0),
85
+ ("berkeley_cable_routing", 1.0),
86
+ ("roboturk", 1.0),
87
+ ("nyu_door_opening_surprising_effectiveness", 1.0),
88
+ ("viola", 1.0),
89
+ ("berkeley_autolab_ur5", 1.0),
90
+ ("toto", 1.0),
91
+ ("language_table", 1.0),
92
+ ("columbia_cairlab_pusht_real", 1.0),
93
+ ("stanford_kuka_multimodal_dataset_converted_externally_to_rlds", 1.0),
94
+ ("nyu_rot_dataset_converted_externally_to_rlds", 1.0),
95
+ ("stanford_hydra_dataset_converted_externally_to_rlds", 1.0),
96
+ ("austin_buds_dataset_converted_externally_to_rlds", 1.0),
97
+ ("nyu_franka_play_dataset_converted_externally_to_rlds", 1.0),
98
+ ("maniskill_dataset_converted_externally_to_rlds", 1.0),
99
+ ("furniture_bench_dataset_converted_externally_to_rlds", 1.0),
100
+ ("cmu_franka_exploration_dataset_converted_externally_to_rlds", 1.0),
101
+ ("ucsd_kitchen_dataset_converted_externally_to_rlds", 1.0),
102
+ ("ucsd_pick_and_place_dataset_converted_externally_to_rlds", 1.0),
103
+ ("austin_sailor_dataset_converted_externally_to_rlds", 1.0),
104
+ ("austin_sirius_dataset_converted_externally_to_rlds", 1.0),
105
+ ("bc_z", 1.0),
106
+ ("utokyo_pr2_opening_fridge_converted_externally_to_rlds", 1.0),
107
+ ("utokyo_pr2_tabletop_manipulation_converted_externally_to_rlds", 1.0),
108
+ ("utokyo_xarm_pick_and_place_converted_externally_to_rlds", 1.0),
109
+ ("utokyo_xarm_bimanual_converted_externally_to_rlds", 1.0),
110
+ ("robo_net", 1.0),
111
+ ("berkeley_mvp_converted_externally_to_rlds", 1.0),
112
+ ("berkeley_rpt_converted_externally_to_rlds", 1.0),
113
+ ("kaist_nonprehensile_converted_externally_to_rlds", 1.0),
114
+ ("stanford_mask_vit_converted_externally_to_rlds", 1.0),
115
+ ("tokyo_u_lsmo_converted_externally_to_rlds", 1.0),
116
+ ("dlr_sara_pour_converted_externally_to_rlds", 1.0),
117
+ ("dlr_sara_grid_clamp_converted_externally_to_rlds", 1.0),
118
+ ("dlr_edan_shared_control_converted_externally_to_rlds", 1.0),
119
+ ("asu_table_top_converted_externally_to_rlds", 1.0),
120
+ ("stanford_robocook_converted_externally_to_rlds", 1.0),
121
+ ("imperialcollege_sawyer_wrist_cam", 1.0),
122
+ ("iamlab_cmu_pickup_insert_converted_externally_to_rlds", 1.0),
123
+ ("uiuc_d3field", 1.0),
124
+ ("utaustin_mutex", 1.0),
125
+ ("berkeley_fanuc_manipulation", 1.0),
126
+ ("cmu_playing_with_food", 1.0),
127
+ ("cmu_play_fusion", 1.0),
128
+ ("cmu_stretch", 1.0),
129
+ ("berkeley_gnm_recon", 1.0),
130
+ ("berkeley_gnm_cory_hall", 1.0),
131
+ ("berkeley_gnm_sac_son", 1.0),
132
+ ]
133
+
134
+ OXE_NAMED_MIXES = {
135
+ "bridge": BRIDGE_MIX,
136
+ "rtx": RT_X_MIX,
137
+ "rtx_franka": RT_X_MIX + OXE_FRANKA_MIX,
138
+ "oxe_magic_soup": OXE_MAGIC_SOUP,
139
+ }
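
    Each mix above pairs a dataset name with a relative sampling weight. As a point of reference, here is a minimal sketch (not part of this commit, and not necessarily how Theia's data loader consumes these constants) of turning a named mix into normalized sampling probabilities; the import path assumes the file layout added in this commit:

    from theia.dataset.oxe.oxe_mixes import OXE_NAMED_MIXES

    def mix_to_sampling_probs(mix: list[tuple[str, float]]) -> dict[str, float]:
        # Normalize (dataset_name, weight) pairs into probabilities that sum to 1.
        total = sum(weight for _, weight in mix)
        return {name: weight / total for name, weight in mix}

    probs = mix_to_sampling_probs(OXE_NAMED_MIXES["oxe_magic_soup"])
    print(round(sum(probs.values()), 6))  # 1.0
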
theia/dataset/oxe/oxe_transforms.py ADDED
@@ -0,0 +1,15 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ import torch
4
+ from numpy.typing import NDArray
5
+ from torchvision.transforms.v2 import Compose, Normalize, ToDtype, ToImage
6
+
7
+
8
+ def totensor(arr: NDArray) -> torch.Tensor:
9
+ """Convert ndarray to tensor."""
10
+ return torch.from_numpy(arr)
11
+
12
+
13
+ oxe_image_transform = Compose(
14
+ [ToImage(), ToDtype(torch.float32, scale=True), Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])]
15
+ ) # ImageNet statistics normalization
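
    A minimal usage sketch for `oxe_image_transform` (not part of this commit), using a random stand-in frame; it assumes the package is importable as `theia`:

    import numpy as np
    from theia.dataset.oxe.oxe_transforms import oxe_image_transform

    # Dummy HWC uint8 frame standing in for an OXE camera image.
    frame = np.random.randint(0, 256, size=(224, 224, 3), dtype=np.uint8)
    tensor = oxe_image_transform(frame)  # CHW float32, ImageNet-normalized
    print(tensor.shape, tensor.dtype)    # torch.Size([3, 224, 224]) torch.float32
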
theia/dataset/video/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ from .video_common import ALL_VIDEO_DATASETS
theia/dataset/video/video_common.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ from collections import OrderedDict
4
+
5
+ ALL_VIDEO_DATASETS = OrderedDict(
6
+ {
7
+ "ego4d_1in150": {"steps": 2_800_871},
8
+ "epic_kitchen_1in60": {"steps": 333_117},
9
+ "ssv2_1in32": {"steps": 312_772},
10
+ }
11
+ )
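
    A small sketch (not part of this commit) of how the per-dataset step counts above can be aggregated, assuming the package is importable:

    from theia.dataset.video.video_common import ALL_VIDEO_DATASETS

    total_steps = sum(info["steps"] for info in ALL_VIDEO_DATASETS.values())
    print(f"{total_steps:,} subsampled video frames in total")  # 3,446,760
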
theia/decoding/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ from .decode import decode_everything, load_feature_stats
4
+ from .depth_anything import prepare_depth_decoder
5
+ from .sam import prepare_mask_generator
theia/decoding/decode.py ADDED
@@ -0,0 +1,198 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ import os
4
+ from typing import Optional
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.nn as nn
9
+ from einops import rearrange
10
+ from numpy.typing import NDArray
11
+ from PIL import Image
12
+ from sklearn.decomposition import PCA
13
+ from transformers import SamModel, SamProcessor
14
+ from transformers.pipelines import MaskGenerationPipeline
15
+
16
+ from theia.decoding.depth_anything import decode_depth_anything
17
+ from theia.decoding.dinov2 import decode_dinov2
18
+ from theia.decoding.sam import decode_sam
19
+ from theia.preprocessing.feature_extraction_core import (
20
+ get_feature_outputs,
21
+ get_model,
22
+ )
23
+
24
+
25
+ def denormalize_feature(
26
+ x: torch.Tensor, mean: Optional[torch.Tensor] = None, std: Optional[torch.Tensor] = None
27
+ ) -> torch.Tensor:
28
+ """Denormalize the features using mean and std.
29
+
30
+ Args:
31
+ x (torch.Tensor): features to be denormalized.
32
+ mean (Optional[torch.Tensor], optional): mean value of the features. Defaults to None
33
+ std (Optional[torch.Tensor], optional): std value of the features. Defaults to None.
34
+
35
+ Returns:
36
+ torch.Tensor: denormalized features.
37
+ """
38
+ if mean is None and std is None:
39
+ return x
40
+ elif mean is None and std is not None:
41
+ return x * std
42
+ elif mean is not None and std is None:
43
+ return x + mean
44
+ return x * std + mean
45
+
46
+
47
+ def load_feature_stats(
48
+ feature_models: list[str], stat_file_root: str
49
+ ) -> tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]:
50
+ """Load the statistics (mean and variance) of the features, per model.
51
+
52
+ Args:
53
+ feature_models (list[str]): names of the models. Note that the names contain `/`.
54
+ stat_file_root (str): directory that holds feature stat files.
55
+
56
+ Returns:
57
+ tuple[dict[str, torch.Tensor], dict[str, torch.Tensor]]: means and variances.
58
+ """
59
+ feature_means: dict[str, torch.Tensor] = {}
60
+ feature_vars: dict[str, torch.Tensor] = {}
61
+ for model in feature_models:
62
+ model_name = model.replace("/", "_")
63
+ feature_means[model] = torch.from_numpy(
64
+ np.load(os.path.join(stat_file_root, f"imagenet_mean_{model_name}.npy"))
65
+ )
66
+ feature_vars[model] = torch.from_numpy(np.load(os.path.join(stat_file_root, f"imagenet_var_{model_name}.npy")))
67
+ return feature_means, feature_vars
68
+
69
+
70
+ def decode_everything(
71
+ theia_model: nn.Module,
72
+ feature_means: dict[str, torch.Tensor],
73
+ feature_vars: dict[str, torch.Tensor],
74
+ images: list[Image.Image],
75
+ mask_generator: MaskGenerationPipeline,
76
+ sam_model: SamModel,
77
+ depth_anything_decoder: nn.Module,
78
+ pred_iou_thresh: float = 0.9,
79
+ stability_score_thresh: float = 0.9,
80
+ gt: bool = False,
81
+ pca: Optional[PCA] = None,
82
+ device: int | str | torch.device = 0,
83
+ ) -> tuple[list[NDArray], Optional[list[NDArray]]]:
84
+ """Decode features from given `theia_model` into different outputs corresponding to upstream models including
85
+ DINOv2, SAM, and Depth Anything.
86
+
87
+ Args:
88
+ theia_model (nn.Module): theia model.
89
+ feature_means (dict[str, torch.Tensor]): means of the features for denormalization.
90
+ feature_vars (dict[str, torch.Tensor]): variance of the features for denormalization.
91
+ images (list[Image.Image]): input images.
92
+ mask_generator (MaskGenerationPipeline): mask generation pipeline.
93
+ sam_model (SamModel): sam model.
94
+ depth_anything_decoder (nn.Module): depth anything decoder.
95
+ pred_iou_thresh (float, optional): iou threshold for mask generation.
96
+ See transformers.pipelines.MaskGenerationPipeline for more details. Defaults to 0.9.
97
+ stability_score_thresh (float, optional): stability score threshold for mask generation.
98
+ See transformers.pipelines.MaskGenerationPipeline for more details. Defaults to 0.9.
99
+ gt (bool): whether to attach ground-truth results to the visualization. Defaults to False.
100
+ pca (Optional[PCA]): PCA for DINOv2 decoding. If provided, this particular PCA is used. Defaults to None.
101
+ device (int | str | torch.device, optional): device for decoding. Defaults to 0.
102
+
103
+ Returns:
104
+ tuple[list[NDArray], Optional[list[NDArray]]]: decoding results from the given model,
105
+ and ground truth (if `gt=True`).
106
+ """
107
+ features: dict[str, torch.Tensor] = {}
108
+ with torch.no_grad():
109
+ for im in images:
110
+ feature = theia_model([im])
111
+ if len(features) == 0:
112
+ features = {k: [] for k in feature}
113
+ for k in feature:
114
+ features[k].append(feature[k].detach().cpu())
115
+ for k in features:
116
+ features[k] = torch.cat(features[k], dim=0)
117
+ for m in features:
118
+ features[m] = denormalize_feature(features[m], feature_means[m], feature_vars[m])
119
+
120
+ dino_model_name = "facebook/dinov2-large"
121
+ sam_model_name = "facebook/sam-vit-huge"
122
+ depth_anything_model_name = "LiheYoung/depth-anything-large-hf"
123
+
124
+ pca = None
125
+ # gt
126
+ gt_decode_results = None
127
+ if gt:
128
+ def legit_model_name(model_name: str) -> str:
129
+ return model_name.replace("/", "_")
130
+
131
+ dino_model, dino_processor = get_model(dino_model_name, device=device)
132
+ dino_gt_feature = []
133
+ for im in images:
134
+ dino_gt_feature.append(
135
+ get_feature_outputs(
136
+ legit_model_name(dino_model_name), dino_model, dino_processor, [im], dtype=torch.float
137
+ )[legit_model_name(dino_model_name)]["embedding"]
138
+ .detach()
139
+ .cpu()
140
+ )
141
+ dino_gt_feature = torch.cat(dino_gt_feature, dim=0)
142
+ dino_gt_feature = rearrange(dino_gt_feature, "b c h w -> b (h w) c")
143
+ dino_gt_dec, pca = decode_dinov2(dino_gt_feature, pca=pca)
144
+ sam_processor = SamProcessor.from_pretrained(sam_model_name)
145
+ sam_gt_feature = []
146
+ for im in images:
147
+ sam_inputs = sam_processor(images=[im], return_tensors="pt").to(device)
148
+ with torch.no_grad():
149
+ sam_gt_feature.append(sam_model.get_image_embeddings(sam_inputs["pixel_values"]).detach().cpu())
150
+ sam_gt_feature = torch.cat(sam_gt_feature, dim=0)
151
+ sam_gt_feature = rearrange(sam_gt_feature, "b c h w -> b (h w) c")
152
+ sam_gt_dec = decode_sam(
153
+ sam_gt_feature, images, mask_generator, pred_iou_thresh=0.9, stability_score_thresh=0.9, device=device
154
+ )
155
+ depth_anything_model, depth_anything_processor = get_model(depth_anything_model_name, device=device)
156
+ depth_anything_gt_feature = []
157
+ for im in images:
158
+ depth_anything_gt_feature.append(
159
+ get_feature_outputs(
160
+ legit_model_name(depth_anything_model_name),
161
+ depth_anything_model,
162
+ depth_anything_processor,
163
+ [im],
164
+ dtype=torch.float,
165
+ )[legit_model_name(depth_anything_model_name)]["embedding"]
166
+ .detach()
167
+ .cpu()
168
+ )
169
+ depth_anything_gt_feature = torch.cat(depth_anything_gt_feature, dim=0)
170
+ depth_anything_gt_feature = rearrange(depth_anything_gt_feature, "b c h w -> b (h w) c")
171
+ depth_gt_dec = decode_depth_anything(depth_anything_gt_feature, depth_anything_decoder, device=device)
172
+
173
+ gt_decode_results = [
174
+ np.hstack([np.array(images[i]).astype(np.float32) / 255.0, dino_gt_dec[i], sam_gt_dec[i], depth_gt_dec[i]])
175
+ for i in range(len(images))
176
+ ]
177
+
178
+ dino_dec, _ = decode_dinov2(features[dino_model_name], pca=pca)
179
+
180
+ try:
181
+ sam_dec = decode_sam(
182
+ features[sam_model_name],
183
+ images,
184
+ mask_generator,
185
+ pred_iou_thresh=pred_iou_thresh,
186
+ stability_score_thresh=stability_score_thresh,
187
+ device=device,
188
+ )
189
+ except IndexError:
190
+ sam_dec = np.zeros_like(dino_dec)
191
+ depth_dec = decode_depth_anything(features[depth_anything_model_name], depth_anything_decoder, device=device)
192
+
193
+ theia_decode_results = [
194
+ np.hstack([np.array(images[i]).astype(np.float32) / 255.0, dino_dec[i], sam_dec[i], depth_dec[i]])
195
+ for i in range(len(images))
196
+ ]
197
+
198
+ return theia_decode_results, gt_decode_results
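
    A minimal sketch (not part of this commit) of the `denormalize_feature` helper defined above, using toy tensors; the import path assumes this commit's layout:

    import torch
    from theia.decoding.decode import denormalize_feature

    x = torch.zeros(2, 4)                     # "normalized" features
    mean, std = torch.ones(4), 2 * torch.ones(4)
    print(denormalize_feature(x, mean, std))  # x * std + mean -> all ones
    print(denormalize_feature(x))             # no stats given -> returned unchanged
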
theia/decoding/depth_anything.py ADDED
@@ -0,0 +1,57 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from einops import rearrange
6
+ from theia.foundation_models.vision_models.depth_anything import DepthAnythingForDepthEstimation
7
+ from numpy.typing import NDArray
8
+ from torch.nn.functional import interpolate
9
+
10
+
11
+ def prepare_depth_decoder(model_name: str, device: int | str | torch.device = 0) -> tuple[nn.Module, int]:
12
+ """Prepare a depth decoder using DepthAnythingForDepthEstimation.
13
+
14
+ Args:
15
+ model_name (str): name of the depth anything model.
16
+ device (int | str | torch.device, optional): device to put the model on. Defaults to 0.
17
+
18
+ Returns:
19
+ tuple[nn.Module, int]: the decoder, and the patch size for depth anything model.
20
+ """
21
+ decoder_head = DepthAnythingForDepthEstimation.from_pretrained(model_name)
22
+ patch_size = decoder_head.config.patch_size
23
+ decoder_head = decoder_head.head
24
+ decoder_head = decoder_head.to(device)
25
+ return decoder_head, patch_size
26
+
27
+
28
+ def decode_depth_anything(features: torch.Tensor, decoder: nn.Module, device: int | str | torch.device = 0) -> NDArray:
29
+ """Decode features to predicted depth using depth anything
30
+
31
+ Args:
32
+ features (torch.Tensor): features to be decoded, should be in shape [batch_size, num_tokens, latent_dim].
33
+ decoder (nn.Module): depth anything decoder
34
+ device (int | str | torch.device, optional): device to perform the decoding. Defaults to 0.
35
+
36
+ Returns:
37
+ NDArray: decoded depth in image format, represented by an NDArray in size [batch_size, height, width, channels]
38
+ with value between [0, 1]. The depth values are min-max normalized to [0, 1] to generate images.
39
+ """
40
+ with torch.no_grad():
41
+ P = int(features.size(1) ** 0.5)
42
+ features = rearrange(features, "b (h w) c -> b c h w", h=P, w=P)
43
+ features = interpolate(features, (224, 224))
44
+ predicted_depths = []
45
+ for feature in features:
46
+ feature = feature.unsqueeze(0).to(device)
47
+
48
+ predicted_depth = decoder.activation1(feature)
49
+ predicted_depth = decoder.conv3(predicted_depth)
50
+ predicted_depth = decoder.activation2(predicted_depth)
51
+ predicted_depth = predicted_depth.squeeze(dim=1) # shape (batch_size, height, width)
52
+ for i in range(len(predicted_depth)):
53
+ min_depth, max_depth = predicted_depth[i].min(), predicted_depth[i].max()
54
+ predicted_depth[i] = (predicted_depth[i] - min_depth) / (max_depth - min_depth)
55
+ predicted_depths.append(predicted_depth.detach().cpu())
56
+ predicted_depths = torch.cat(predicted_depths, dim=0)
57
+ return predicted_depths.unsqueeze(-1).repeat((1, 1, 1, 3)).numpy() # type: ignore [attr-defined]
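
    A usage sketch (not part of this commit) of the two helpers above. It downloads the Depth Anything weights from the Hugging Face Hub and feeds random stand-in features shaped like the 64x64x32 Depth Anything features used elsewhere in this commit, so it only illustrates the calling convention:

    import torch
    from theia.decoding.depth_anything import decode_depth_anything, prepare_depth_decoder

    decoder, patch_size = prepare_depth_decoder("LiheYoung/depth-anything-large-hf", device="cpu")

    # Random stand-in features: 64*64 tokens, 32 channels each.
    features = torch.randn(1, 64 * 64, 32)
    depth = decode_depth_anything(features, decoder, device="cpu")
    print(depth.shape)  # (1, 224, 224, 3)
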
theia/decoding/dinov2.py ADDED
@@ -0,0 +1,69 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ from typing import Optional
4
+
5
+ import cv2
6
+ import numpy as np
7
+ from numpy.typing import NDArray
8
+ from sklearn.decomposition import PCA
9
+ from sklearn.preprocessing import minmax_scale
10
+
11
+
12
+ def decode_dinov2(
13
+ features: NDArray, threshold: int | float = -100, interpolation: bool = False, pca: Optional[PCA] = None
14
+ ) -> tuple[NDArray, PCA]:
15
+ """
16
+ Decode the input `features` in DINOv2 style using PCA.
17
+
18
+ Args:
19
+ features (NDArray): features to be decoded, should be in shape [batch_size, num_tokens, latent_dim].
20
+ threshold (int | float): threshold of foreground-background split in PCA visualization.
21
+ Defaults to -100 (all patches are included).
22
+ interpolation (bool): whether to interpolate the 16x16 PCA map to the original image size (otherwise patches are block-replicated). Defaults to False.
23
+ pca (Optional[PCA]): if provided, use the provided PCA. This is to keep visualizations stable across samples.
24
+
25
+ Returns:
26
+ tuple[NDArray, PCA]: the rendered image of this visualization, in NDArray in size
27
+ [batch_size, height, width, channels] with values in [0, 1], and the PCA used in this visualization.
28
+ """
29
+ features = features.numpy()
30
+ batch_size, spatial_size, latent_dim = features.shape
31
+ h = w = int(spatial_size**0.5)
32
+
33
+ features = features.reshape(-1, latent_dim)
34
+
35
+ if pca is None:
36
+ pca = PCA(n_components=3)
37
+ pca.fit(features)
38
+
39
+ pca_features = pca.transform(features)
40
+
41
+ # segment using the first component
42
+ bg_mask = pca_features[:, 0] < threshold
43
+ fg_mask = ~bg_mask
44
+
45
+ # PCA for only foreground patches
46
+ # pca.fit(features[fg_mask])
47
+ pca_features_fg = pca.transform(features[fg_mask])
48
+ for i in range(3):
49
+ pca_features_fg[:, i] = minmax_scale(pca_features_fg[:, i])
50
+
51
+ pca_features_rgb = pca_features.copy()
52
+ pca_features_rgb[bg_mask] = 0
53
+ pca_features_rgb[fg_mask] = pca_features_fg
54
+
55
+ pca_features_rgb = pca_features_rgb.reshape(batch_size, h, w, 3)
56
+ if not interpolation:
57
+ H = W = 224
58
+ scale = H // h
59
+ interpolated_pca_features = np.zeros((batch_size, H, W, 3), dtype=pca_features_rgb.dtype)
60
+ for i in range(len(pca_features_rgb)):
61
+ for j in range(h):
62
+ for k in range(w):
63
+ interpolated_pca_features[i, scale * j : scale * (j + 1), scale * k : scale * (k + 1)] = (
64
+ pca_features_rgb[i, j, k]
65
+ )
66
+ pca_features_rgb = interpolated_pca_features
67
+ else:
68
+ pca_features_rgb = np.stack([cv2.resize(p, (224, 224)) for p in pca_features_rgb])
69
+ return pca_features_rgb, pca
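
    A minimal sketch (not part of this commit) of `decode_dinov2` on random stand-in features; fitting the PCA once and reusing it keeps colors comparable across calls:

    import torch
    from theia.decoding.dinov2 import decode_dinov2

    # Random stand-in for DINOv2 features: 16x16 tokens, 1024 channels.
    features = torch.randn(2, 16 * 16, 1024)
    rgb, pca = decode_dinov2(features)               # fits a fresh 3-component PCA
    rgb_again, _ = decode_dinov2(features, pca=pca)  # reuse the fitted PCA
    print(rgb.shape)  # (2, 224, 224, 3)
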
theia/decoding/sam.py ADDED
@@ -0,0 +1,191 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ from typing import Any, Generator, Optional
4
+
5
+ import numpy as np
6
+ import torch
7
+ from einops import rearrange
8
+ from numpy.typing import NDArray
9
+ from PIL import Image
10
+ from transformers import SamModel, SamProcessor
11
+ from transformers.image_utils import load_image
12
+ from transformers.pipelines import MaskGenerationPipeline
13
+
14
+
15
+ class MaskGenerationPipelineWithEmbeddings(MaskGenerationPipeline):
16
+ """
17
+ The wrapper class for huggingface transformers.pipelines.MaskGenerationPipeline
18
+ that can decode from intermediate SAM embeddings.
19
+ """
20
+
21
+ def _sanitize_parameters(self, **kwargs: Any) -> tuple[dict[str, Any], ...]:
22
+ preprocess_kwargs = {}
23
+ postprocess_kwargs = {}
24
+ forward_params = {}
25
+ # preprocess args
26
+ if "embeddings" in kwargs: # inject embeddings here
27
+ preprocess_kwargs["embeddings"] = kwargs["embeddings"]
28
+ if "points_per_batch" in kwargs:
29
+ preprocess_kwargs["points_per_batch"] = kwargs["points_per_batch"]
30
+ if "points_per_crop" in kwargs:
31
+ preprocess_kwargs["points_per_crop"] = kwargs["points_per_crop"]
32
+ if "crops_n_layers" in kwargs:
33
+ preprocess_kwargs["crops_n_layers"] = kwargs["crops_n_layers"]
34
+ if "crop_overlap_ratio" in kwargs:
35
+ preprocess_kwargs["crop_overlap_ratio"] = kwargs["crop_overlap_ratio"]
36
+ if "crop_n_points_downscale_factor" in kwargs:
37
+ preprocess_kwargs["crop_n_points_downscale_factor"] = kwargs["crop_n_points_downscale_factor"]
38
+ if "timeout" in kwargs:
39
+ preprocess_kwargs["timeout"] = kwargs["timeout"]
40
+ # postprocess args
41
+ if "pred_iou_thresh" in kwargs:
42
+ forward_params["pred_iou_thresh"] = kwargs["pred_iou_thresh"]
43
+ if "stability_score_offset" in kwargs:
44
+ forward_params["stability_score_offset"] = kwargs["stability_score_offset"]
45
+ if "mask_threshold" in kwargs:
46
+ forward_params["mask_threshold"] = kwargs["mask_threshold"]
47
+ if "stability_score_thresh" in kwargs:
48
+ forward_params["stability_score_thresh"] = kwargs["stability_score_thresh"]
49
+ if "crops_nms_thresh" in kwargs:
50
+ postprocess_kwargs["crops_nms_thresh"] = kwargs["crops_nms_thresh"]
51
+ if "output_rle_mask" in kwargs:
52
+ postprocess_kwargs["output_rle_mask"] = kwargs["output_rle_mask"]
53
+ if "output_bboxes_mask" in kwargs:
54
+ postprocess_kwargs["output_bboxes_mask"] = kwargs["output_bboxes_mask"]
55
+ return preprocess_kwargs, forward_params, postprocess_kwargs
56
+
57
+ def preprocess(
58
+ self,
59
+ image: list[Image.Image],
60
+ points_per_batch: int = 64,
61
+ crops_n_layers: int = 0,
62
+ crop_overlap_ratio: float = 512 / 1500,
63
+ points_per_crop: int = 32,
64
+ crop_n_points_downscale_factor: int = 1,
65
+ timeout: Optional[float] = None,
66
+ embeddings: Optional[torch.Tensor] = None,
67
+ ) -> Generator[Any, Any, Any]:
68
+ image = load_image(image, timeout=timeout)
69
+ target_size = self.image_processor.size["longest_edge"]
70
+ crop_boxes, grid_points, cropped_images, input_labels = self.image_processor.generate_crop_boxes(
71
+ image, target_size, crops_n_layers, crop_overlap_ratio, points_per_crop, crop_n_points_downscale_factor
72
+ )
73
+ model_inputs = self.image_processor(images=cropped_images, return_tensors="pt")
74
+
75
+ with self.device_placement():
76
+ if self.framework == "pt":
77
+ inference_context = self.get_inference_context()
78
+ with inference_context():
79
+ model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
80
+ if embeddings is None:
81
+ image_embeddings = self.model.get_image_embeddings(model_inputs.pop("pixel_values"))
82
+ else:
83
+ model_inputs.pop("pixel_values")
84
+ image_embeddings = embeddings
85
+ model_inputs["image_embeddings"] = image_embeddings
86
+
87
+ n_points = grid_points.shape[1]
88
+ points_per_batch = points_per_batch if points_per_batch is not None else n_points
89
+
90
+ if points_per_batch <= 0:
91
+ raise ValueError(
92
+ "Cannot have points_per_batch<=0. Must be >=1 to returned batched outputs. "
93
+ "To return all points at once, set points_per_batch to None"
94
+ )
95
+
96
+ for i in range(0, n_points, points_per_batch):
97
+ batched_points = grid_points[:, i : i + points_per_batch, :, :]
98
+ labels = input_labels[:, i : i + points_per_batch]
99
+ is_last = i == n_points - points_per_batch
100
+ yield {
101
+ "input_points": batched_points,
102
+ "input_labels": labels,
103
+ "input_boxes": crop_boxes,
104
+ "is_last": is_last,
105
+ **model_inputs,
106
+ }
107
+
108
+
109
+ def draw_mask(mask: NDArray, random_color: bool = False) -> NDArray:
110
+ """Draw the mask on an image.
111
+
112
+ Args:
113
+ mask (NDArray): mask in shape [height, width].
114
+ random_color (bool): if using a random color. Defaults to False.
115
+
116
+ Returns:
117
+ NDArray: NDArray format of the image.
118
+ """
119
+ if random_color:
120
+ color = np.concatenate([np.random.random(3)], axis=0)
121
+ else:
122
+ color = np.array([30 / 255, 144 / 255, 255 / 255])
123
+ h, w = mask.shape[-2:]
124
+ mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
125
+ return mask_image
126
+
127
+
128
+ def decode_sam(
129
+ features: torch.Tensor,
130
+ images: list[Image.Image],
131
+ mask_generator: Any,
132
+ points_per_batch: int = 64,
133
+ pred_iou_thresh: float = 0.5,
134
+ stability_score_thresh: float = 0.6,
135
+ random_color: bool = True,
136
+ device: int | str | torch.device = 0,
137
+ ) -> NDArray:
138
+ """Decode features using SAM (auto-prompting) mask generation pipeline.
139
+
140
+ Args:
141
+ features (torch.Tensor): features to be decoded, should be in shape [batch_size, num_tokens, latent_dim].
142
+ images (list[Image.Image]): images corresponding to these features.
143
+ mask_generator (Any): mask generation pipeline.
144
+ points_per_batch (int): points per batch for auto-prompting. Defaults to 64.
145
+ See transformers.pipelines.MaskGenerationPipeline for more details. Same below.
146
+ pred_iou_thresh (float): IoU threshold. Defaults to 0.5.
147
+ stability_score_thresh (float): stability threshold. Defaults to 0.6.
148
+ random_color (bool): if using a random color. Defaults to True.
149
+ device (int | str | torch.device): device to perform the decoding. Defaults to 0.
150
+
151
+ Returns:
152
+ NDArray: decoded masks rendered in image format, represented by an NDArray in size
153
+ [batch_size, height, width, channels] with values in [0, 1].
154
+ """
155
+ masks_rgbs = []
156
+ num_patches = int(features.size(1) ** 0.5)
157
+ features = rearrange(features, "b (h w) c -> b c h w", h=num_patches, w=num_patches)
158
+ with torch.no_grad():
159
+ for im, feature in zip(images, features, strict=False):
160
+ predicted_outputs = mask_generator(
161
+ im,
162
+ points_per_batch=points_per_batch,
163
+ embeddings=feature.unsqueeze(0).to(device),
164
+ pred_iou_thresh=pred_iou_thresh,
165
+ stability_score_thresh=stability_score_thresh,
166
+ )
167
+ predicted_masks = predicted_outputs["masks"]
168
+ masks_rgb = np.zeros((224, 224, 3), dtype=np.float32)
169
+ for mask in predicted_masks:
170
+ masks_rgb += draw_mask(mask, random_color=random_color)
171
+ # masks_rgb = cv2.cvtColor(masks_rgb, cv2.COLOR_RGBA2RGB)
172
+ masks_rgbs.append(masks_rgb)
173
+ return np.stack(masks_rgbs)
174
+
175
+
176
+ def prepare_mask_generator(device: int | str | torch.device = 0) -> tuple[MaskGenerationPipeline, SamModel]:
177
+ """Prepare a mask generation pipeline on device `device`.
178
+
179
+ Args:
180
+ device (int | str | torch.device): device to perform mask generation. Defaults to 0.
181
+
182
+ Returns:
183
+ tuple[MaskGenerationPipeline, SamModel]: the mask generation pipeline and the underlying SAM model.
184
+ """
185
+ sam_model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
186
+ processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
187
+ sam_model.eval()
188
+ mask_generator = MaskGenerationPipelineWithEmbeddings(
189
+ task="mask_generation", model=sam_model, image_processor=processor.image_processor, device=device
190
+ )
191
+ return mask_generator, sam_model
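
    Running the full pipeline requires downloading the SAM weights, so here is a lighter sketch (not part of this commit) of just the `draw_mask` helper; `decode_sam` composes overlays like this from the masks returned by the pipeline:

    import numpy as np
    from theia.decoding.sam import draw_mask

    canvas = np.zeros((224, 224, 3), dtype=np.float32)
    mask_a = np.zeros((224, 224))
    mask_a[50:120, 50:120] = 1
    mask_b = np.zeros((224, 224))
    mask_b[100:180, 100:180] = 1
    for mask in (mask_a, mask_b):
        canvas += draw_mask(mask, random_color=True)  # one random RGB color per mask
    print(canvas.shape)  # (224, 224, 3); values exceed 1 only where masks overlap
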
theia/example/decode_to_vfms.ipynb ADDED
@@ -0,0 +1,69 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import os\n",
10
+ "import cv2\n",
11
+ "import torch\n",
12
+ "from PIL import Image\n",
13
+ "import numpy as np\n",
14
+ "from transformers import AutoModel\n",
15
+ "from torchvision.io import read_video, write_video\n",
16
+ "from theia.decoding import load_feature_stats, prepare_depth_decoder, prepare_mask_generator, decode_everything\n",
17
+ "\n",
18
+ "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
19
+ "theia_model = AutoModel.from_pretrained(\"theaiinstitute/theia-base-patch16-224-cdiv\", trust_remote_code=True)\n",
20
+ "theia_model = theia_model.to(device)\n",
21
+ "target_model_names = [\n",
22
+ " \"google/vit-huge-patch14-224-in21k\",\n",
23
+ " \"facebook/dinov2-large\",\n",
24
+ " \"openai/clip-vit-large-patch14\",\n",
25
+ " \"facebook/sam-vit-huge\",\n",
26
+ " \"LiheYoung/depth-anything-large-hf\",\n",
27
+ "]\n",
28
+ "feature_means, feature_vars = load_feature_stats(target_model_names, stat_file_root=\"../../../feature_stats\")\n",
29
+ "\n",
30
+ "mask_generator, sam_model = prepare_mask_generator(device)\n",
31
+ "depth_anything_model_name = \"LiheYoung/depth-anything-large-hf\"\n",
32
+ "depth_anything_decoder, _ = prepare_depth_decoder(depth_anything_model_name, device)\n",
33
+ "\n",
34
+ "example_video_path = \"../../../media/example_video_to_visualize.mp4\"\n",
35
+ "video, _, _ = read_video(example_video_path, pts_unit=\"sec\", output_format=\"THWC\")\n",
36
+ "video = video.numpy()\n",
37
+ "images = [Image.fromarray(cv2.resize(im, (224, 224))) for im in video]\n",
38
+ "\n",
39
+ "theia_decode_results, gt_decode_results = decode_everything(\n",
40
+ " theia_model=theia_model,\n",
41
+ " feature_means=feature_means,\n",
42
+ " feature_vars=feature_vars,\n",
43
+ " images=images,\n",
44
+ " mask_generator=mask_generator,\n",
45
+ " sam_model=sam_model,\n",
46
+ " depth_anything_decoder=depth_anything_decoder,\n",
47
+ " pred_iou_thresh=0.5,\n",
48
+ " stability_score_thresh=0.7,\n",
49
+ " gt=True,\n",
50
+ " device=device,\n",
51
+ ")\n",
52
+ "\n",
53
+ "vis_video = np.stack(\n",
54
+ " [np.vstack([tr, gtr]) for tr, gtr in zip(theia_decode_results, gt_decode_results, strict=False)]\n",
55
+ ")\n",
56
+ "vis_video = torch.from_numpy(vis_video * 255.0).to(torch.uint8)\n",
57
+ "vis_save_path = \"./visualized.mp4\"\n",
58
+ "write_video(vis_save_path, vis_video, fps=10)"
59
+ ]
60
+ }
61
+ ],
62
+ "metadata": {
63
+ "language_info": {
64
+ "name": "python"
65
+ }
66
+ },
67
+ "nbformat": 4,
68
+ "nbformat_minor": 2
69
+ }
theia/foundation_models/__init__.py ADDED
@@ -0,0 +1,9 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ from .vision_language_models.clip import get_clip_feature, get_clip_model
4
+ from .vision_language_models.llava import get_llava_vision_model, get_llava_visual_feature
5
+ from .vision_models.deit import get_deit_feature, get_deit_model
6
+ from .vision_models.depth_anything import get_depth_anything_feature, get_depth_anything_model
7
+ from .vision_models.dinov2 import get_dinov2_feature, get_dinov2_model
8
+ from .vision_models.sam import get_sam_feature, get_sam_model
9
+ from .vision_models.vit import get_vit_feature, get_vit_model
theia/foundation_models/common.py ADDED
@@ -0,0 +1,87 @@
1
+ # Copyright (c) 2024 Boston Dynamics AI Institute LLC. All rights reserved.
2
+
3
+ import math
4
+
5
+ import torch
6
+
7
+ MODELS = [
8
+ "facebook/dinov2-large",
9
+ "facebook/sam-vit-huge",
10
+ "google/vit-huge-patch14-224-in21k",
11
+ "llava-hf/llava-1.5-7b-hf",
12
+ "openai/clip-vit-large-patch14",
13
+ "LiheYoung/depth-anything-large-hf",
14
+ ]
15
+
16
+ # handy model feature size constants
17
+ # in the format of (latent_dim, width, height)
18
+ MODEL_FEATURE_SIZES = {
19
+ "facebook/dinov2-large": (1024, 16, 16),
20
+ "facebook/sam-vit-huge": (256, 64, 64),
21
+ "google/vit-huge-patch14-224-in21k": (1280, 16, 16),
22
+ "llava-hf/llava-1.5-7b-hf": (1024, 24, 24),
23
+ "openai/clip-vit-large-patch14": (1024, 16, 16),
24
+ "LiheYoung/depth-anything-large-hf": (32, 64, 64),
25
+ }
26
+
27
+
28
+ def get_model_feature_size(
29
+ model_name: str, keep_spatial: bool = False, return_torch_size: bool = False
30
+ ) -> tuple[int, ...] | torch.Size:
31
+ """
32
+ Get the size of queried model feature.
33
+
34
+ Args:
35
+ model_name (str): name of the model.
36
+ keep_spatial (bool): whether to preserve spatial dim. Defaults to False.
37
+ return_torch_size (bool): return torch.Size instead of python tuple. Defaults to False.
38
+
39
+ Returns:
40
+ tuple[int, ...] | torch.Size: the size of the feature.
41
+ """
42
+ size: tuple[int, ...] = MODEL_FEATURE_SIZES[model_name]
43
+
44
+ if not keep_spatial:
45
+ size = (size[0], math.prod(size[1:]))
46
+
47
+ if return_torch_size:
48
+ size = torch.Size(size)
49
+
50
+ return size
51
+
52
+
53
+ def get_max_model_spatial_size(
54
+ keep_spatial: bool = True,
55
+ return_torch_size: bool = False,
56
+ return_model_name: bool = False,
57
+ ) -> tuple[int, ...] | tuple[tuple[int, ...], str]:
58
+ """Get the maximal spatial dimensions from available models
59
+
60
+ Args:
61
+ keep_spatial (bool): whether to preserve spatial dim. Defaults to True.
62
+ return_torch_size (bool): return torch.Size instead of python tuple. Defaults to False.
63
+ return_model_name (bool): whether to also return the name of the model with the maximal size. Defaults to False.
64
+
65
+ Returns:
66
+ tuple[int, ...] | tuple[tuple[int, ...], str]: the maximal size and optional model name.
67
+ """
68
+ max_flatten_size = -1
69
+ max_size: tuple[int, ...] = ()
70
+ max_size_model_name: str = ""
71
+ for model, size in MODEL_FEATURE_SIZES.items():
72
+ flatten_size = math.prod(size[1:])
73
+ if flatten_size > max_flatten_size:
74
+ max_flatten_size = flatten_size
75
+ max_size = size[1:]
76
+ max_size_model_name = model
77
+
78
+ if not keep_spatial:
79
+ max_size = (max_flatten_size,)
80
+
81
+ if return_torch_size:
82
+ max_size = torch.Size(max_size)
83
+
84
+ if return_model_name:
85
+ return max_size, max_size_model_name
86
+ else:
87
+ return max_size
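
    A quick usage sketch (not part of this commit) of the helpers above, assuming the package is importable as `theia`:

    from theia.foundation_models.common import get_max_model_spatial_size, get_model_feature_size

    print(get_model_feature_size("facebook/dinov2-large"))                     # (1024, 256)
    print(get_model_feature_size("facebook/dinov2-large", keep_spatial=True))  # (1024, 16, 16)
    print(get_max_model_spatial_size(return_model_name=True))                  # ((64, 64), 'facebook/sam-vit-huge')
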