pg56714 commited on
Commit
a375a27
1 Parent(s): d23df68

Upload 110 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. efficient_sam/__init__.py +7 -0
  2. efficient_sam/__pycache__/__init__.cpython-310.pyc +0 -0
  3. efficient_sam/__pycache__/build_efficient_sam.cpython-310.pyc +0 -0
  4. efficient_sam/__pycache__/efficient_sam.cpython-310.pyc +0 -0
  5. efficient_sam/__pycache__/efficient_sam_decoder.cpython-310.pyc +0 -0
  6. efficient_sam/__pycache__/efficient_sam_encoder.cpython-310.pyc +0 -0
  7. efficient_sam/__pycache__/mlp.cpython-310.pyc +0 -0
  8. efficient_sam/__pycache__/two_way_transformer.cpython-310.pyc +0 -0
  9. efficient_sam/build_efficient_sam.py +22 -0
  10. efficient_sam/efficient_sam.py +305 -0
  11. efficient_sam/efficient_sam_decoder.py +315 -0
  12. efficient_sam/efficient_sam_encoder.py +257 -0
  13. efficient_sam/mlp.py +29 -0
  14. efficient_sam/two_way_transformer.py +266 -0
  15. efficientvit/__init__.py +0 -0
  16. efficientvit/__pycache__/__init__.cpython-310.pyc +0 -0
  17. efficientvit/__pycache__/sam_model_zoo.cpython-310.pyc +0 -0
  18. efficientvit/apps/__init__.py +0 -0
  19. efficientvit/apps/__pycache__/__init__.cpython-310.pyc +0 -0
  20. efficientvit/apps/data_provider/__init__.py +7 -0
  21. efficientvit/apps/data_provider/__pycache__/__init__.cpython-310.pyc +0 -0
  22. efficientvit/apps/data_provider/__pycache__/base.cpython-310.pyc +0 -0
  23. efficientvit/apps/data_provider/augment/__init__.py +6 -0
  24. efficientvit/apps/data_provider/augment/__pycache__/__init__.cpython-310.pyc +0 -0
  25. efficientvit/apps/data_provider/augment/__pycache__/bbox.cpython-310.pyc +0 -0
  26. efficientvit/apps/data_provider/augment/__pycache__/color_aug.cpython-310.pyc +0 -0
  27. efficientvit/apps/data_provider/augment/bbox.py +30 -0
  28. efficientvit/apps/data_provider/augment/color_aug.py +78 -0
  29. efficientvit/apps/data_provider/base.py +199 -0
  30. efficientvit/apps/data_provider/random_resolution/__init__.py +7 -0
  31. efficientvit/apps/data_provider/random_resolution/__pycache__/__init__.cpython-310.pyc +0 -0
  32. efficientvit/apps/data_provider/random_resolution/__pycache__/controller.cpython-310.pyc +0 -0
  33. efficientvit/apps/data_provider/random_resolution/_data_loader.py +1538 -0
  34. efficientvit/apps/data_provider/random_resolution/_data_worker.py +358 -0
  35. efficientvit/apps/data_provider/random_resolution/controller.py +92 -0
  36. efficientvit/apps/setup.py +135 -0
  37. efficientvit/apps/trainer/__init__.py +6 -0
  38. efficientvit/apps/trainer/__pycache__/__init__.cpython-310.pyc +0 -0
  39. efficientvit/apps/trainer/__pycache__/base.cpython-310.pyc +0 -0
  40. efficientvit/apps/trainer/__pycache__/run_config.cpython-310.pyc +0 -0
  41. efficientvit/apps/trainer/base.py +299 -0
  42. efficientvit/apps/trainer/run_config.py +115 -0
  43. efficientvit/apps/utils/__init__.py +12 -0
  44. efficientvit/apps/utils/__pycache__/__init__.cpython-310.pyc +0 -0
  45. efficientvit/apps/utils/__pycache__/dist.cpython-310.pyc +0 -0
  46. efficientvit/apps/utils/__pycache__/ema.cpython-310.pyc +0 -0
  47. efficientvit/apps/utils/__pycache__/export.cpython-310.pyc +0 -0
  48. efficientvit/apps/utils/__pycache__/init.cpython-310.pyc +0 -0
  49. efficientvit/apps/utils/__pycache__/lr.cpython-310.pyc +0 -0
  50. efficientvit/apps/utils/__pycache__/metric.cpython-310.pyc +0 -0
efficient_sam/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ from .build_efficient_sam import (
5
+ build_efficient_sam_vitt,
6
+ build_efficient_sam_vits,
7
+ )
efficient_sam/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (274 Bytes). View file
 
efficient_sam/__pycache__/build_efficient_sam.cpython-310.pyc ADDED
Binary file (650 Bytes). View file
 
efficient_sam/__pycache__/efficient_sam.cpython-310.pyc ADDED
Binary file (8.13 kB). View file
 
efficient_sam/__pycache__/efficient_sam_decoder.cpython-310.pyc ADDED
Binary file (9.79 kB). View file
 
efficient_sam/__pycache__/efficient_sam_encoder.cpython-310.pyc ADDED
Binary file (7.34 kB). View file
 
efficient_sam/__pycache__/mlp.cpython-310.pyc ADDED
Binary file (1.24 kB). View file
 
efficient_sam/__pycache__/two_way_transformer.cpython-310.pyc ADDED
Binary file (7.34 kB). View file
 
efficient_sam/build_efficient_sam.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .efficient_sam import build_efficient_sam
8
+
9
+ def build_efficient_sam_vitt():
10
+ return build_efficient_sam(
11
+ encoder_patch_embed_dim=192,
12
+ encoder_num_heads=3,
13
+ checkpoint="weights/efficient_sam_vitt.pt",
14
+ ).eval()
15
+
16
+
17
+ def build_efficient_sam_vits():
18
+ return build_efficient_sam(
19
+ encoder_patch_embed_dim=384,
20
+ encoder_num_heads=6,
21
+ checkpoint="weights/efficient_sam_vits.pt",
22
+ ).eval()
efficient_sam/efficient_sam.py ADDED
@@ -0,0 +1,305 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ from typing import Any, List, Tuple, Type
9
+
10
+ import torch
11
+ import torch.nn.functional as F
12
+
13
+ from torch import nn, Tensor
14
+
15
+ from .efficient_sam_decoder import MaskDecoder, PromptEncoder
16
+ from .efficient_sam_encoder import ImageEncoderViT
17
+ from .two_way_transformer import TwoWayAttentionBlock, TwoWayTransformer
18
+
19
+ class EfficientSam(nn.Module):
20
+ mask_threshold: float = 0.0
21
+ image_format: str = "RGB"
22
+
23
+ def __init__(
24
+ self,
25
+ image_encoder: ImageEncoderViT,
26
+ prompt_encoder: PromptEncoder,
27
+ decoder_max_num_input_points: int,
28
+ mask_decoder: MaskDecoder,
29
+ pixel_mean: List[float] = [0.485, 0.456, 0.406],
30
+ pixel_std: List[float] = [0.229, 0.224, 0.225],
31
+ ) -> None:
32
+ """
33
+ SAM predicts object masks from an image and input prompts.
34
+
35
+ Arguments:
36
+ image_encoder (ImageEncoderViT): The backbone used to encode the
37
+ image into image embeddings that allow for efficient mask prediction.
38
+ prompt_encoder (PromptEncoder): Encodes various types of input prompts.
39
+ mask_decoder (MaskDecoder): Predicts masks from the image embeddings
40
+ and encoded prompts.
41
+ pixel_mean (list(float)): Mean values for normalizing pixels in the input image.
42
+ pixel_std (list(float)): Std values for normalizing pixels in the input image.
43
+ """
44
+ super().__init__()
45
+ self.image_encoder = image_encoder
46
+ self.prompt_encoder = prompt_encoder
47
+ self.decoder_max_num_input_points = decoder_max_num_input_points
48
+ self.mask_decoder = mask_decoder
49
+ self.register_buffer(
50
+ "pixel_mean", torch.Tensor(pixel_mean).view(1, 3, 1, 1), False
51
+ )
52
+ self.register_buffer(
53
+ "pixel_std", torch.Tensor(pixel_std).view(1, 3, 1, 1), False
54
+ )
55
+
56
+ @torch.jit.export
57
+ def predict_masks(
58
+ self,
59
+ image_embeddings: torch.Tensor,
60
+ batched_points: torch.Tensor,
61
+ batched_point_labels: torch.Tensor,
62
+ multimask_output: bool,
63
+ input_h: int,
64
+ input_w: int,
65
+ output_h: int = -1,
66
+ output_w: int = -1,
67
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
68
+ """
69
+ Predicts masks given image embeddings and prompts. This only runs the decoder.
70
+
71
+ Arguments:
72
+ image_embeddings: A tensor of shape [B, C, H, W] or [B*max_num_queries, C, H, W]
73
+ batched_points: A tensor of shape [B, max_num_queries, num_pts, 2]
74
+ batched_point_labels: A tensor of shape [B, max_num_queries, num_pts]
75
+ Returns:
76
+ A tuple of two tensors:
77
+ low_res_mask: A tensor of shape [B, max_num_queries, 256, 256] of predicted masks
78
+ iou_predictions: A tensor of shape [B, max_num_queries] of estimated IOU scores
79
+ """
80
+
81
+ batch_size, max_num_queries, num_pts, _ = batched_points.shape
82
+ num_pts = batched_points.shape[2]
83
+ rescaled_batched_points = self.get_rescaled_pts(batched_points, input_h, input_w)
84
+
85
+ if num_pts > self.decoder_max_num_input_points:
86
+ rescaled_batched_points = rescaled_batched_points[
87
+ :, :, : self.decoder_max_num_input_points, :
88
+ ]
89
+ batched_point_labels = batched_point_labels[
90
+ :, :, : self.decoder_max_num_input_points
91
+ ]
92
+ elif num_pts < self.decoder_max_num_input_points:
93
+ rescaled_batched_points = F.pad(
94
+ rescaled_batched_points,
95
+ (0, 0, 0, self.decoder_max_num_input_points - num_pts),
96
+ value=-1.0,
97
+ )
98
+ batched_point_labels = F.pad(
99
+ batched_point_labels,
100
+ (0, self.decoder_max_num_input_points - num_pts),
101
+ value=-1.0,
102
+ )
103
+
104
+ sparse_embeddings = self.prompt_encoder(
105
+ rescaled_batched_points.reshape(
106
+ batch_size * max_num_queries, self.decoder_max_num_input_points, 2
107
+ ),
108
+ batched_point_labels.reshape(
109
+ batch_size * max_num_queries, self.decoder_max_num_input_points
110
+ ),
111
+ )
112
+ sparse_embeddings = sparse_embeddings.view(
113
+ batch_size,
114
+ max_num_queries,
115
+ sparse_embeddings.shape[1],
116
+ sparse_embeddings.shape[2],
117
+ )
118
+ low_res_masks, iou_predictions = self.mask_decoder(
119
+ image_embeddings,
120
+ self.prompt_encoder.get_dense_pe(),
121
+ sparse_prompt_embeddings=sparse_embeddings,
122
+ multimask_output=multimask_output,
123
+ )
124
+ _, num_predictions, low_res_size, _ = low_res_masks.shape
125
+
126
+ if output_w > 0 and output_h > 0:
127
+ output_masks = F.interpolate(
128
+ low_res_masks, (output_h, output_w), mode="bicubic"
129
+ )
130
+ output_masks = torch.reshape(
131
+ output_masks,
132
+ (batch_size, max_num_queries, num_predictions, output_h, output_w),
133
+ )
134
+ else:
135
+ output_masks = torch.reshape(
136
+ low_res_masks,
137
+ (
138
+ batch_size,
139
+ max_num_queries,
140
+ num_predictions,
141
+ low_res_size,
142
+ low_res_size,
143
+ ),
144
+ )
145
+ iou_predictions = torch.reshape(
146
+ iou_predictions, (batch_size, max_num_queries, num_predictions)
147
+ )
148
+ return output_masks, iou_predictions
149
+
150
+ def get_rescaled_pts(self, batched_points: torch.Tensor, input_h: int, input_w: int):
151
+ return torch.stack(
152
+ [
153
+ torch.where(
154
+ batched_points[..., 0] >= 0,
155
+ batched_points[..., 0] * self.image_encoder.img_size / input_w,
156
+ -1.0,
157
+ ),
158
+ torch.where(
159
+ batched_points[..., 1] >= 0,
160
+ batched_points[..., 1] * self.image_encoder.img_size / input_h,
161
+ -1.0,
162
+ ),
163
+ ],
164
+ dim=-1,
165
+ )
166
+
167
+ @torch.jit.export
168
+ def get_image_embeddings(self, batched_images) -> torch.Tensor:
169
+ """
170
+ Predicts masks end-to-end from provided images and prompts.
171
+ If prompts are not known in advance, using SamPredictor is
172
+ recommended over calling the model directly.
173
+
174
+ Arguments:
175
+ batched_images: A tensor of shape [B, 3, H, W]
176
+ Returns:
177
+ List of image embeddings each of of shape [B, C(i), H(i), W(i)].
178
+ The last embedding corresponds to the final layer.
179
+ """
180
+ batched_images = self.preprocess(batched_images)
181
+ return self.image_encoder(batched_images)
182
+
183
+ def forward(
184
+ self,
185
+ batched_images: torch.Tensor,
186
+ batched_points: torch.Tensor,
187
+ batched_point_labels: torch.Tensor,
188
+ scale_to_original_image_size: bool = True,
189
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
190
+ """
191
+ Predicts masks end-to-end from provided images and prompts.
192
+ If prompts are not known in advance, using SamPredictor is
193
+ recommended over calling the model directly.
194
+
195
+ Arguments:
196
+ batched_images: A tensor of shape [B, 3, H, W]
197
+ batched_points: A tensor of shape [B, num_queries, max_num_pts, 2]
198
+ batched_point_labels: A tensor of shape [B, num_queries, max_num_pts]
199
+
200
+ Returns:
201
+ A list tuples of two tensors where the ith element is by considering the first i+1 points.
202
+ low_res_mask: A tensor of shape [B, 256, 256] of predicted masks
203
+ iou_predictions: A tensor of shape [B, max_num_queries] of estimated IOU scores
204
+ """
205
+ batch_size, _, input_h, input_w = batched_images.shape
206
+ image_embeddings = self.get_image_embeddings(batched_images)
207
+ return self.predict_masks(
208
+ image_embeddings,
209
+ batched_points,
210
+ batched_point_labels,
211
+ multimask_output=True,
212
+ input_h=input_h,
213
+ input_w=input_w,
214
+ output_h=input_h if scale_to_original_image_size else -1,
215
+ output_w=input_w if scale_to_original_image_size else -1,
216
+ )
217
+
218
+ def preprocess(self, x: torch.Tensor) -> torch.Tensor:
219
+ """Normalize pixel values and pad to a square input."""
220
+ if (
221
+ x.shape[2] != self.image_encoder.img_size
222
+ or x.shape[3] != self.image_encoder.img_size
223
+ ):
224
+ x = F.interpolate(
225
+ x,
226
+ (self.image_encoder.img_size, self.image_encoder.img_size),
227
+ mode="bilinear",
228
+ )
229
+ return (x - self.pixel_mean) / self.pixel_std
230
+
231
+
232
+ def build_efficient_sam(encoder_patch_embed_dim, encoder_num_heads, checkpoint=None):
233
+ img_size = 1024
234
+ encoder_patch_size = 16
235
+ encoder_depth = 12
236
+ encoder_mlp_ratio = 4.0
237
+ encoder_neck_dims = [256, 256]
238
+ decoder_max_num_input_points = 6
239
+ decoder_transformer_depth = 2
240
+ decoder_transformer_mlp_dim = 2048
241
+ decoder_num_heads = 8
242
+ decoder_upscaling_layer_dims = [64, 32]
243
+ num_multimask_outputs = 3
244
+ iou_head_depth = 3
245
+ iou_head_hidden_dim = 256
246
+ activation = "gelu"
247
+ normalization_type = "layer_norm"
248
+ normalize_before_activation = False
249
+
250
+ assert activation == "relu" or activation == "gelu"
251
+ if activation == "relu":
252
+ activation_fn = nn.ReLU
253
+ else:
254
+ activation_fn = nn.GELU
255
+
256
+ image_encoder = ImageEncoderViT(
257
+ img_size=img_size,
258
+ patch_size=encoder_patch_size,
259
+ in_chans=3,
260
+ patch_embed_dim=encoder_patch_embed_dim,
261
+ normalization_type=normalization_type,
262
+ depth=encoder_depth,
263
+ num_heads=encoder_num_heads,
264
+ mlp_ratio=encoder_mlp_ratio,
265
+ neck_dims=encoder_neck_dims,
266
+ act_layer=activation_fn,
267
+ )
268
+
269
+ image_embedding_size = image_encoder.image_embedding_size
270
+ encoder_transformer_output_dim = image_encoder.transformer_output_dim
271
+
272
+ sam = EfficientSam(
273
+ image_encoder=image_encoder,
274
+ prompt_encoder=PromptEncoder(
275
+ embed_dim=encoder_transformer_output_dim,
276
+ image_embedding_size=(image_embedding_size, image_embedding_size),
277
+ input_image_size=(img_size, img_size),
278
+ ),
279
+ decoder_max_num_input_points=decoder_max_num_input_points,
280
+ mask_decoder=MaskDecoder(
281
+ transformer_dim=encoder_transformer_output_dim,
282
+ transformer=TwoWayTransformer(
283
+ depth=decoder_transformer_depth,
284
+ embedding_dim=encoder_transformer_output_dim,
285
+ num_heads=decoder_num_heads,
286
+ mlp_dim=decoder_transformer_mlp_dim,
287
+ activation=activation_fn,
288
+ normalize_before_activation=normalize_before_activation,
289
+ ),
290
+ num_multimask_outputs=num_multimask_outputs,
291
+ activation=activation_fn,
292
+ normalization_type=normalization_type,
293
+ normalize_before_activation=normalize_before_activation,
294
+ iou_head_depth=iou_head_depth - 1,
295
+ iou_head_hidden_dim=iou_head_hidden_dim,
296
+ upscaling_layer_dims=decoder_upscaling_layer_dims,
297
+ ),
298
+ pixel_mean=[0.485, 0.456, 0.406],
299
+ pixel_std=[0.229, 0.224, 0.225],
300
+ )
301
+ if checkpoint is not None:
302
+ with open(checkpoint, "rb") as f:
303
+ state_dict = torch.load(f, map_location="cpu")
304
+ sam.load_state_dict(state_dict["model"])
305
+ return sam
efficient_sam/efficient_sam_decoder.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from typing import List, Tuple, Type
8
+
9
+ import numpy as np
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+ from .mlp import MLPBlock
15
+
16
+
17
+ class PromptEncoder(nn.Module):
18
+ def __init__(
19
+ self,
20
+ embed_dim: int,
21
+ image_embedding_size: Tuple[int, int],
22
+ input_image_size: Tuple[int, int],
23
+ ) -> None:
24
+ """
25
+ Encodes prompts for input to SAM's mask decoder.
26
+
27
+ Arguments:
28
+ embed_dim (int): The prompts' embedding dimension
29
+ image_embedding_size (tuple(int, int)): The spatial size of the
30
+ image embedding, as (H, W).
31
+ input_image_size (int): The padded size of the image as input
32
+ to the image encoder, as (H, W).
33
+ """
34
+ super().__init__()
35
+ self.embed_dim = embed_dim
36
+ self.input_image_size = input_image_size
37
+ self.image_embedding_size = image_embedding_size
38
+ self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)
39
+ self.invalid_points = nn.Embedding(1, embed_dim)
40
+ self.point_embeddings = nn.Embedding(1, embed_dim)
41
+ self.bbox_top_left_embeddings = nn.Embedding(1, embed_dim)
42
+ self.bbox_bottom_right_embeddings = nn.Embedding(1, embed_dim)
43
+
44
+ def get_dense_pe(self) -> torch.Tensor:
45
+ """
46
+ Returns the positional encoding used to encode point prompts,
47
+ applied to a dense set of points the shape of the image encoding.
48
+
49
+ Returns:
50
+ torch.Tensor: Positional encoding with shape
51
+ 1x(embed_dim)x(embedding_h)x(embedding_w)
52
+ """
53
+ return self.pe_layer(self.image_embedding_size).unsqueeze(0)
54
+
55
+ def _embed_points(
56
+ self,
57
+ points: torch.Tensor,
58
+ labels: torch.Tensor,
59
+ ) -> torch.Tensor:
60
+ """Embeds point prompts."""
61
+ points = points + 0.5 # Shift to center of pixel
62
+ point_embedding = self.pe_layer.forward_with_coords(
63
+ points, self.input_image_size
64
+ )
65
+ invalid_label_ids = torch.eq(labels, -1)[:,:,None]
66
+ point_label_ids = torch.eq(labels, 1)[:,:,None]
67
+ topleft_label_ids = torch.eq(labels, 2)[:,:,None]
68
+ bottomright_label_ids = torch.eq(labels, 3)[:,:,None]
69
+ point_embedding = point_embedding + self.invalid_points.weight[:,None,:] * invalid_label_ids
70
+ point_embedding = point_embedding + self.point_embeddings.weight[:,None,:] * point_label_ids
71
+ point_embedding = point_embedding + self.bbox_top_left_embeddings.weight[:,None,:] * topleft_label_ids
72
+ point_embedding = point_embedding + self.bbox_bottom_right_embeddings.weight[:,None,:] * bottomright_label_ids
73
+ return point_embedding
74
+
75
+ def forward(
76
+ self,
77
+ coords,
78
+ labels,
79
+ ) -> torch.Tensor:
80
+ """
81
+ Embeds different types of prompts, returning both sparse and dense
82
+ embeddings.
83
+
84
+ Arguments:
85
+ points: A tensor of shape [B, 2]
86
+ labels: An integer tensor of shape [B] where each element is 1,2 or 3.
87
+
88
+ Returns:
89
+ torch.Tensor: sparse embeddings for the points and boxes, with shape
90
+ BxNx(embed_dim), where N is determined by the number of input points
91
+ and boxes.
92
+ """
93
+ return self._embed_points(coords, labels)
94
+
95
+
96
+ class PositionEmbeddingRandom(nn.Module):
97
+ """
98
+ Positional encoding using random spatial frequencies.
99
+ """
100
+
101
+ def __init__(self, num_pos_feats: int) -> None:
102
+ super().__init__()
103
+ self.register_buffer(
104
+ "positional_encoding_gaussian_matrix", torch.randn((2, num_pos_feats))
105
+ )
106
+
107
+ def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor:
108
+ """Positionally encode points that are normalized to [0,1]."""
109
+ # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape
110
+ coords = 2 * coords - 1
111
+ coords = coords @ self.positional_encoding_gaussian_matrix
112
+ coords = 2 * np.pi * coords
113
+ # outputs d_1 x ... x d_n x C shape
114
+ return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1)
115
+
116
+ def forward(self, size: Tuple[int, int]) -> torch.Tensor:
117
+ """Generate positional encoding for a grid of the specified size."""
118
+ h, w = size
119
+ device = self.positional_encoding_gaussian_matrix.device
120
+ grid = torch.ones([h, w], device=device, dtype=torch.float32)
121
+ y_embed = grid.cumsum(dim=0) - 0.5
122
+ x_embed = grid.cumsum(dim=1) - 0.5
123
+ y_embed = y_embed / h
124
+ x_embed = x_embed / w
125
+
126
+ pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1))
127
+ return pe.permute(2, 0, 1) # C x H x W
128
+
129
+ def forward_with_coords(
130
+ self, coords_input: torch.Tensor, image_size: Tuple[int, int]
131
+ ) -> torch.Tensor:
132
+ """Positionally encode points that are not normalized to [0,1]."""
133
+ coords = coords_input.clone()
134
+ coords[:, :, 0] = coords[:, :, 0] / image_size[1]
135
+ coords[:, :, 1] = coords[:, :, 1] / image_size[0]
136
+ return self._pe_encoding(coords.to(torch.float)) # B x N x C
137
+
138
+
139
+ class MaskDecoder(nn.Module):
140
+ def __init__(
141
+ self,
142
+ *,
143
+ transformer_dim: int,
144
+ transformer: nn.Module,
145
+ num_multimask_outputs: int,
146
+ activation: Type[nn.Module],
147
+ normalization_type: str,
148
+ normalize_before_activation: bool,
149
+ iou_head_depth: int,
150
+ iou_head_hidden_dim: int,
151
+ upscaling_layer_dims: List[int],
152
+ ) -> None:
153
+ """
154
+ Predicts masks given an image and prompt embeddings, using a
155
+ transformer architecture.
156
+
157
+ Arguments:
158
+ transformer_dim (int): the channel dimension of the transformer
159
+ transformer (nn.Module): the transformer used to predict masks
160
+ num_multimask_outputs (int): the number of masks to predict
161
+ when disambiguating masks
162
+ activation (nn.Module): the type of activation to use when
163
+ upscaling masks
164
+ iou_head_depth (int): the depth of the MLP used to predict
165
+ mask quality
166
+ iou_head_hidden_dim (int): the hidden dimension of the MLP
167
+ used to predict mask quality
168
+ """
169
+ super().__init__()
170
+ self.transformer_dim = transformer_dim
171
+ self.transformer = transformer
172
+
173
+ self.num_multimask_outputs = num_multimask_outputs
174
+
175
+ self.iou_token = nn.Embedding(1, transformer_dim)
176
+ if num_multimask_outputs > 1:
177
+ self.num_mask_tokens = num_multimask_outputs + 1
178
+ else:
179
+ self.num_mask_tokens = 1
180
+ self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
181
+ output_dim_after_upscaling = transformer_dim
182
+
183
+ self.final_output_upscaling_layers = nn.ModuleList([])
184
+ for idx, layer_dims in enumerate(upscaling_layer_dims):
185
+ self.final_output_upscaling_layers.append(
186
+ nn.Sequential(
187
+ nn.ConvTranspose2d(
188
+ output_dim_after_upscaling,
189
+ layer_dims,
190
+ kernel_size=2,
191
+ stride=2,
192
+ ),
193
+ nn.GroupNorm(1, layer_dims)
194
+ if idx < len(upscaling_layer_dims) - 1
195
+ else nn.Identity(),
196
+ activation(),
197
+ )
198
+ )
199
+ output_dim_after_upscaling = layer_dims
200
+
201
+ self.output_hypernetworks_mlps = nn.ModuleList(
202
+ [
203
+ MLPBlock(
204
+ input_dim=transformer_dim,
205
+ hidden_dim=transformer_dim,
206
+ output_dim=output_dim_after_upscaling,
207
+ num_layers=2,
208
+ act=activation,
209
+ )
210
+ for i in range(self.num_mask_tokens)
211
+ ]
212
+ )
213
+
214
+ self.iou_prediction_head = MLPBlock(
215
+ input_dim=transformer_dim,
216
+ hidden_dim=iou_head_hidden_dim,
217
+ output_dim=self.num_mask_tokens,
218
+ num_layers=iou_head_depth,
219
+ act=activation,
220
+ )
221
+
222
+ def forward(
223
+ self,
224
+ image_embeddings: torch.Tensor,
225
+ image_pe: torch.Tensor,
226
+ sparse_prompt_embeddings: torch.Tensor,
227
+ multimask_output: bool,
228
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
229
+ """
230
+ Predict masks given image and prompt embeddings.
231
+
232
+ Arguments:
233
+ image_embeddings: A tensor of shape [B, C, H, W] or [B*max_num_queries, C, H, W]
234
+ image_pe (torch.Tensor): positional encoding with the shape of image_embeddings (the batch dimension is broadcastable).
235
+ sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes
236
+ multimask_output (bool): Whether to return multiple masks or a single
237
+ mask.
238
+
239
+ Returns:
240
+ torch.Tensor: batched predicted masks
241
+ torch.Tensor: batched predictions of mask quality
242
+ """
243
+
244
+ (
245
+ batch_size,
246
+ max_num_queries,
247
+ sparse_embed_dim_1,
248
+ sparse_embed_dim_2,
249
+ ) = sparse_prompt_embeddings.shape
250
+
251
+ (
252
+ _,
253
+ image_embed_dim_c,
254
+ image_embed_dim_h,
255
+ image_embed_dim_w,
256
+ ) = image_embeddings.shape
257
+
258
+ # Tile the image embedding for all queries.
259
+ image_embeddings_tiled = torch.tile(
260
+ image_embeddings[:, None, :, :, :], [1, max_num_queries, 1, 1, 1]
261
+ ).view(
262
+ batch_size * max_num_queries,
263
+ image_embed_dim_c,
264
+ image_embed_dim_h,
265
+ image_embed_dim_w,
266
+ )
267
+ sparse_prompt_embeddings = sparse_prompt_embeddings.reshape(
268
+ batch_size * max_num_queries, sparse_embed_dim_1, sparse_embed_dim_2
269
+ )
270
+ masks, iou_pred = self.predict_masks(
271
+ image_embeddings=image_embeddings_tiled,
272
+ image_pe=image_pe,
273
+ sparse_prompt_embeddings=sparse_prompt_embeddings,
274
+ )
275
+ if multimask_output and self.num_multimask_outputs > 1:
276
+ return masks[:, 1:, :], iou_pred[:, 1:]
277
+ else:
278
+ return masks[:, :1, :], iou_pred[:, :1]
279
+
280
+ def predict_masks(
281
+ self,
282
+ image_embeddings: torch.Tensor,
283
+ image_pe: torch.Tensor,
284
+ sparse_prompt_embeddings: torch.Tensor,
285
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
286
+ """Predicts masks. See 'forward' for more details."""
287
+ # Concatenate output tokens
288
+ output_tokens = torch.cat(
289
+ [self.iou_token.weight, self.mask_tokens.weight], dim=0
290
+ )
291
+ output_tokens = output_tokens.unsqueeze(0).expand(
292
+ sparse_prompt_embeddings.size(0), -1, -1
293
+ )
294
+ tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1)
295
+ # Expand per-image data in batch direction to be per-mask
296
+ pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0)
297
+ b, c, h, w = image_embeddings.shape
298
+ hs, src = self.transformer(image_embeddings, pos_src, tokens)
299
+ iou_token_out = hs[:, 0, :]
300
+ mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :]
301
+
302
+ # Upscale mask embeddings and predict masks using the mask tokens
303
+ upscaled_embedding = src.transpose(1, 2).view(b, c, h, w)
304
+
305
+ for upscaling_layer in self.final_output_upscaling_layers:
306
+ upscaled_embedding = upscaling_layer(upscaled_embedding)
307
+ hyper_in_list: List[torch.Tensor] = []
308
+ for i, output_hypernetworks_mlp in enumerate(self.output_hypernetworks_mlps):
309
+ hyper_in_list.append(output_hypernetworks_mlp(mask_tokens_out[:, i, :]))
310
+ hyper_in = torch.stack(hyper_in_list, dim=1)
311
+ b, c, h, w = upscaled_embedding.shape
312
+ masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w)
313
+ # Generate mask quality predictions
314
+ iou_pred = self.iou_prediction_head(iou_token_out)
315
+ return masks, iou_pred
efficient_sam/efficient_sam_encoder.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ import math
8
+ from typing import List, Optional, Tuple, Type
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+
14
+
15
+ class LayerNorm2d(nn.Module):
16
+ def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
17
+ super().__init__()
18
+ self.weight = nn.Parameter(torch.ones(num_channels))
19
+ self.bias = nn.Parameter(torch.zeros(num_channels))
20
+ self.eps = eps
21
+
22
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
23
+ u = x.mean(1, keepdim=True)
24
+ s = (x - u).pow(2).mean(1, keepdim=True)
25
+ x = (x - u) / torch.sqrt(s + self.eps)
26
+ x = self.weight[:, None, None] * x + self.bias[:, None, None]
27
+ return x
28
+
29
+
30
+ class PatchEmbed(nn.Module):
31
+ """2D Image to Patch Embedding"""
32
+
33
+ def __init__(
34
+ self,
35
+ img_size,
36
+ patch_size,
37
+ in_chans,
38
+ embed_dim,
39
+ ):
40
+ super().__init__()
41
+ self.proj = nn.Conv2d(
42
+ in_chans,
43
+ embed_dim,
44
+ kernel_size=(patch_size, patch_size),
45
+ stride=(patch_size, patch_size),
46
+ bias=True,
47
+ )
48
+
49
+ def forward(self, x):
50
+ B, C, H, W = x.shape
51
+ x = self.proj(x)
52
+ return x
53
+
54
+
55
+ class Attention(nn.Module):
56
+ def __init__(
57
+ self,
58
+ dim,
59
+ num_heads,
60
+ qkv_bias,
61
+ qk_scale=None,
62
+ ):
63
+ super().__init__()
64
+ self.num_heads = num_heads
65
+ head_dim = dim // num_heads
66
+ self.scale = qk_scale or head_dim**-0.5
67
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
68
+ self.proj = nn.Linear(dim, dim)
69
+
70
+ def forward(self, x):
71
+ B, N, C = x.shape
72
+ qkv = (
73
+ self.qkv(x)
74
+ .reshape(B, N, 3, self.num_heads, C // self.num_heads)
75
+ .permute(2, 0, 3, 1, 4)
76
+ )
77
+ q, k, v = (
78
+ qkv[0],
79
+ qkv[1],
80
+ qkv[2],
81
+ )
82
+ attn = (q @ k.transpose(-2, -1)) * self.scale
83
+ attn = attn.softmax(dim=-1)
84
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
85
+ x = self.proj(x)
86
+ return x
87
+
88
+
89
+ class Mlp(nn.Module):
90
+ def __init__(
91
+ self,
92
+ in_features,
93
+ hidden_features=None,
94
+ out_features=None,
95
+ act_layer=nn.GELU,
96
+ ):
97
+ super().__init__()
98
+ out_features = out_features or in_features
99
+ hidden_features = hidden_features or in_features
100
+ self.fc1 = nn.Linear(in_features, hidden_features)
101
+ self.act = act_layer()
102
+ self.fc2 = nn.Linear(hidden_features, out_features)
103
+
104
+ def forward(self, x):
105
+ x = self.fc1(x)
106
+ x = self.act(x)
107
+ x = self.fc2(x)
108
+ return x
109
+
110
+
111
+ class Block(nn.Module):
112
+ def __init__(
113
+ self,
114
+ dim,
115
+ num_heads,
116
+ mlp_ratio=4.0,
117
+ qkv_bias=False,
118
+ qk_scale=None,
119
+ act_layer=nn.GELU,
120
+ ):
121
+ super().__init__()
122
+ self.norm1 = nn.LayerNorm(dim, eps=1e-6)
123
+ self.attn = Attention(
124
+ dim,
125
+ num_heads=num_heads,
126
+ qkv_bias=qkv_bias,
127
+ qk_scale=qk_scale,
128
+ )
129
+ self.norm2 = nn.LayerNorm(dim, eps=1e-6)
130
+ mlp_hidden_dim = int(dim * mlp_ratio)
131
+ self.mlp = Mlp(
132
+ in_features=dim,
133
+ hidden_features=mlp_hidden_dim,
134
+ act_layer=act_layer,
135
+ )
136
+
137
+ def forward(self, x):
138
+ x = x + self.attn(self.norm1(x))
139
+ x = x + self.mlp(self.norm2(x))
140
+ return x
141
+
142
+
143
+ @torch.jit.export
144
+ def get_abs_pos(
145
+ abs_pos: torch.Tensor, has_cls_token: bool, hw: List[int]
146
+ ) -> torch.Tensor:
147
+ """
148
+ Calculate absolute positional embeddings. If needed, resize embeddings and remove cls_token
149
+ dimension for the original embeddings.
150
+ Args:
151
+ abs_pos (Tensor): absolute positional embeddings with (1, num_position, C).
152
+ has_cls_token (bool): If true, has 1 embedding in abs_pos for cls token.
153
+ hw (Tuple): size of input image tokens.
154
+
155
+ Returns:
156
+ Absolute positional embeddings after processing with shape (1, H, W, C)
157
+ """
158
+ h = hw[0]
159
+ w = hw[1]
160
+ if has_cls_token:
161
+ abs_pos = abs_pos[:, 1:]
162
+ xy_num = abs_pos.shape[1]
163
+ size = int(math.sqrt(xy_num))
164
+ assert size * size == xy_num
165
+
166
+ if size != h or size != w:
167
+ new_abs_pos = F.interpolate(
168
+ abs_pos.reshape(1, size, size, -1).permute(0, 3, 1, 2),
169
+ size=(h, w),
170
+ mode="bicubic",
171
+ align_corners=False,
172
+ )
173
+ return new_abs_pos.permute(0, 2, 3, 1)
174
+ else:
175
+ return abs_pos.reshape(1, h, w, -1)
176
+
177
+
178
+ # Image encoder for efficient SAM.
179
+ class ImageEncoderViT(nn.Module):
180
+ def __init__(
181
+ self,
182
+ img_size: int,
183
+ patch_size: int,
184
+ in_chans: int,
185
+ patch_embed_dim: int,
186
+ normalization_type: str,
187
+ depth: int,
188
+ num_heads: int,
189
+ mlp_ratio: float,
190
+ neck_dims: List[int],
191
+ act_layer: Type[nn.Module],
192
+ ) -> None:
193
+ """
194
+ Args:
195
+ img_size (int): Input image size.
196
+ patch_size (int): Patch size.
197
+ in_chans (int): Number of input image channels.
198
+ patch_embed_dim (int): Patch embedding dimension.
199
+ depth (int): Depth of ViT.
200
+ num_heads (int): Number of attention heads in each ViT block.
201
+ mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
202
+ act_layer (nn.Module): Activation layer.
203
+ """
204
+ super().__init__()
205
+
206
+ self.img_size = img_size
207
+ self.image_embedding_size = img_size // ((patch_size if patch_size > 0 else 1))
208
+ self.transformer_output_dim = ([patch_embed_dim] + neck_dims)[-1]
209
+ self.pretrain_use_cls_token = True
210
+ pretrain_img_size = 224
211
+ self.patch_embed = PatchEmbed(img_size, patch_size, in_chans, patch_embed_dim)
212
+ # Initialize absolute positional embedding with pretrain image size.
213
+ num_patches = (pretrain_img_size // patch_size) * (
214
+ pretrain_img_size // patch_size
215
+ )
216
+ num_positions = num_patches + 1
217
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_positions, patch_embed_dim))
218
+ self.blocks = nn.ModuleList()
219
+ for i in range(depth):
220
+ vit_block = Block(patch_embed_dim, num_heads, mlp_ratio, True)
221
+ self.blocks.append(vit_block)
222
+ self.neck = nn.Sequential(
223
+ nn.Conv2d(
224
+ patch_embed_dim,
225
+ neck_dims[0],
226
+ kernel_size=1,
227
+ bias=False,
228
+ ),
229
+ LayerNorm2d(neck_dims[0]),
230
+ nn.Conv2d(
231
+ neck_dims[0],
232
+ neck_dims[0],
233
+ kernel_size=3,
234
+ padding=1,
235
+ bias=False,
236
+ ),
237
+ LayerNorm2d(neck_dims[0]),
238
+ )
239
+
240
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
241
+ assert (
242
+ x.shape[2] == self.img_size and x.shape[3] == self.img_size
243
+ ), "input image size must match self.img_size"
244
+ x = self.patch_embed(x)
245
+ # B C H W -> B H W C
246
+ x = x.permute(0, 2, 3, 1)
247
+ x = x + get_abs_pos(
248
+ self.pos_embed, self.pretrain_use_cls_token, [x.shape[1], x.shape[2]]
249
+ )
250
+ num_patches = x.shape[1]
251
+ assert x.shape[2] == num_patches
252
+ x = x.reshape(x.shape[0], num_patches * num_patches, x.shape[3])
253
+ for blk in self.blocks:
254
+ x = blk(x)
255
+ x = x.reshape(x.shape[0], num_patches, num_patches, x.shape[2])
256
+ x = self.neck(x.permute(0, 3, 1, 2))
257
+ return x
efficient_sam/mlp.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Type
2
+
3
+ from torch import nn
4
+
5
+
6
+ # Lightly adapted from
7
+ # https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa
8
+ class MLPBlock(nn.Module):
9
+ def __init__(
10
+ self,
11
+ input_dim: int,
12
+ hidden_dim: int,
13
+ output_dim: int,
14
+ num_layers: int,
15
+ act: Type[nn.Module],
16
+ ) -> None:
17
+ super().__init__()
18
+ self.num_layers = num_layers
19
+ h = [hidden_dim] * (num_layers - 1)
20
+ self.layers = nn.ModuleList(
21
+ nn.Sequential(nn.Linear(n, k), act())
22
+ for n, k in zip([input_dim] + h, [hidden_dim] * num_layers)
23
+ )
24
+ self.fc = nn.Linear(hidden_dim, output_dim)
25
+
26
+ def forward(self, x):
27
+ for layer in self.layers:
28
+ x = layer(x)
29
+ return self.fc(x)
efficient_sam/two_way_transformer.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ from typing import Tuple, Type
3
+ import torch
4
+ from torch import nn, Tensor
5
+ from .mlp import MLPBlock
6
+
7
+
8
+
9
+
10
+ class TwoWayTransformer(nn.Module):
11
+ def __init__(
12
+ self,
13
+ depth: int,
14
+ embedding_dim: int,
15
+ num_heads: int,
16
+ mlp_dim: int,
17
+ activation: Type[nn.Module],
18
+ normalize_before_activation: bool,
19
+ attention_downsample_rate: int = 2,
20
+ ) -> None:
21
+ """
22
+ A transformer decoder that attends to an input image using
23
+ queries whose positional embedding is supplied.
24
+
25
+ Args:
26
+ depth (int): number of layers in the transformer
27
+ embedding_dim (int): the channel dimension for the input embeddings
28
+ num_heads (int): the number of heads for multihead attention. Must
29
+ divide embedding_dim
30
+ mlp_dim (int): the channel dimension internal to the MLP block
31
+ activation (nn.Module): the activation to use in the MLP block
32
+ """
33
+ super().__init__()
34
+ self.depth = depth
35
+ self.embedding_dim = embedding_dim
36
+ self.num_heads = num_heads
37
+ self.mlp_dim = mlp_dim
38
+ self.layers = nn.ModuleList()
39
+
40
+ for i in range(depth):
41
+ curr_layer = TwoWayAttentionBlock(
42
+ embedding_dim=embedding_dim,
43
+ num_heads=num_heads,
44
+ mlp_dim=mlp_dim,
45
+ activation=activation,
46
+ normalize_before_activation=normalize_before_activation,
47
+ attention_downsample_rate=attention_downsample_rate,
48
+ skip_first_layer_pe=(i == 0),
49
+ )
50
+ self.layers.append(curr_layer)
51
+
52
+ self.final_attn_token_to_image = AttentionForTwoWayAttentionBlock(
53
+ embedding_dim,
54
+ num_heads,
55
+ downsample_rate=attention_downsample_rate,
56
+ )
57
+ self.norm_final_attn = nn.LayerNorm(embedding_dim)
58
+
59
+ def forward(
60
+ self,
61
+ image_embedding: Tensor,
62
+ image_pe: Tensor,
63
+ point_embedding: Tensor,
64
+ ) -> Tuple[Tensor, Tensor]:
65
+ """
66
+ Args:
67
+ image_embedding (torch.Tensor): image to attend to. Should be shape
68
+ B x embedding_dim x h x w for any h and w.
69
+ image_pe (torch.Tensor): the positional encoding to add to the image. Must
70
+ have the same shape as image_embedding.
71
+ point_embedding (torch.Tensor): the embedding to add to the query points.
72
+ Must have shape B x N_points x embedding_dim for any N_points.
73
+
74
+ Returns:
75
+ torch.Tensor: the processed point_embedding
76
+ torch.Tensor: the processed image_embedding
77
+ """
78
+
79
+ # BxCxHxW -> BxHWxC == B x N_image_tokens x C
80
+ bs, c, h, w = image_embedding.shape
81
+ image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
82
+ image_pe = image_pe.flatten(2).permute(0, 2, 1)
83
+
84
+ # Prepare queries
85
+ queries = point_embedding
86
+ keys = image_embedding
87
+
88
+ # Apply transformer blocks and final layernorm
89
+ for idx, layer in enumerate(self.layers):
90
+ queries, keys = layer(
91
+ queries=queries,
92
+ keys=keys,
93
+ query_pe=point_embedding,
94
+ key_pe=image_pe,
95
+ )
96
+
97
+ # Apply the final attention layer from the points to the image
98
+ q = queries + point_embedding
99
+ k = keys + image_pe
100
+ attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
101
+ queries = queries + attn_out
102
+ queries = self.norm_final_attn(queries)
103
+ return queries, keys
104
+
105
+
106
+ class TwoWayAttentionBlock(nn.Module):
107
+ def __init__(
108
+ self,
109
+ embedding_dim: int,
110
+ num_heads: int,
111
+ mlp_dim: int,
112
+ activation: Type[nn.Module],
113
+ normalize_before_activation: bool,
114
+ attention_downsample_rate: int = 2,
115
+ skip_first_layer_pe: bool = False,
116
+ ) -> None:
117
+ """
118
+ A transformer block with four layers: (1) self-attention of sparse
119
+ inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
120
+ block on sparse inputs, and (4) cross attention of dense inputs to sparse
121
+ inputs.
122
+
123
+ Arguments:
124
+ embedding_dim (int): the channel dimension of the embeddings
125
+ num_heads (int): the number of heads in the attention layers
126
+ mlp_dim (int): the hidden dimension of the mlp block
127
+ activation (nn.Module): the activation of the mlp block
128
+ skip_first_layer_pe (bool): skip the PE on the first layer
129
+ """
130
+ super().__init__()
131
+ self.self_attn = AttentionForTwoWayAttentionBlock(embedding_dim, num_heads)
132
+ self.norm1 = nn.LayerNorm(embedding_dim)
133
+
134
+ self.cross_attn_token_to_image = AttentionForTwoWayAttentionBlock(
135
+ embedding_dim,
136
+ num_heads,
137
+ downsample_rate=attention_downsample_rate,
138
+ )
139
+ self.norm2 = nn.LayerNorm(embedding_dim)
140
+
141
+ self.mlp = MLPBlock(
142
+ embedding_dim,
143
+ mlp_dim,
144
+ embedding_dim,
145
+ 1,
146
+ activation,
147
+ )
148
+
149
+ self.norm3 = nn.LayerNorm(embedding_dim)
150
+
151
+ self.norm4 = nn.LayerNorm(embedding_dim)
152
+ self.cross_attn_image_to_token = AttentionForTwoWayAttentionBlock(
153
+ embedding_dim,
154
+ num_heads,
155
+ downsample_rate=attention_downsample_rate,
156
+ )
157
+
158
+ self.skip_first_layer_pe = skip_first_layer_pe
159
+
160
+ def forward(
161
+ self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor
162
+ ) -> Tuple[Tensor, Tensor]:
163
+ # Self attention block
164
+ if not self.skip_first_layer_pe:
165
+ queries = queries + query_pe
166
+ attn_out = self.self_attn(q=queries, k=queries, v=queries)
167
+ queries = queries + attn_out
168
+ queries = self.norm1(queries)
169
+
170
+ # Cross attention block, tokens attending to image embedding
171
+ q = queries + query_pe
172
+ k = keys + key_pe
173
+ attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys)
174
+ queries = queries + attn_out
175
+ queries = self.norm2(queries)
176
+
177
+ # MLP block
178
+ mlp_out = self.mlp(queries)
179
+ queries = queries + mlp_out
180
+ queries = self.norm3(queries)
181
+
182
+ # Cross attention block, image embedding attending to tokens
183
+ q = queries + query_pe
184
+ k = keys + key_pe
185
+ attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries)
186
+ keys = keys + attn_out
187
+ keys = self.norm4(keys)
188
+
189
+ return queries, keys
190
+
191
+
192
+ class AttentionForTwoWayAttentionBlock(nn.Module):
193
+ """
194
+ An attention layer that allows for downscaling the size of the embedding
195
+ after projection to queries, keys, and values.
196
+ """
197
+
198
+ def __init__(
199
+ self,
200
+ embedding_dim: int,
201
+ num_heads: int,
202
+ downsample_rate: int = 1,
203
+ ) -> None:
204
+ super().__init__()
205
+ self.embedding_dim = embedding_dim
206
+ self.internal_dim = embedding_dim // downsample_rate
207
+ self.num_heads = num_heads
208
+ assert (
209
+ self.internal_dim % num_heads == 0
210
+ ), "num_heads must divide embedding_dim."
211
+ self.c_per_head = self.internal_dim / num_heads
212
+ self.inv_sqrt_c_per_head = 1.0 / math.sqrt(self.c_per_head)
213
+
214
+ self.q_proj = nn.Linear(embedding_dim, self.internal_dim)
215
+ self.k_proj = nn.Linear(embedding_dim, self.internal_dim)
216
+ self.v_proj = nn.Linear(embedding_dim, self.internal_dim)
217
+ self.out_proj = nn.Linear(self.internal_dim, embedding_dim)
218
+ self._reset_parameters()
219
+
220
+ def _reset_parameters(self) -> None:
221
+ # The fan_out is incorrect, but matches pytorch's initialization
222
+ # for which qkv is a single 3*embedding_dim x embedding_dim matrix
223
+ fan_in = self.embedding_dim
224
+ fan_out = 3 * self.internal_dim
225
+ # Xavier uniform with our custom fan_out
226
+ bnd = math.sqrt(6 / (fan_in + fan_out))
227
+ nn.init.uniform_(self.q_proj.weight, -bnd, bnd)
228
+ nn.init.uniform_(self.k_proj.weight, -bnd, bnd)
229
+ nn.init.uniform_(self.v_proj.weight, -bnd, bnd)
230
+ # out_proj.weight is left with default initialization, like pytorch attention
231
+ nn.init.zeros_(self.q_proj.bias)
232
+ nn.init.zeros_(self.k_proj.bias)
233
+ nn.init.zeros_(self.v_proj.bias)
234
+ nn.init.zeros_(self.out_proj.bias)
235
+
236
+ def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor:
237
+ b, n, c = x.shape
238
+ x = x.reshape(b, n, num_heads, c // num_heads)
239
+ return x.transpose(1, 2) # B x N_heads x N_tokens x C_per_head
240
+
241
+ def _recombine_heads(self, x: Tensor) -> Tensor:
242
+ b, n_heads, n_tokens, c_per_head = x.shape
243
+ x = x.transpose(1, 2)
244
+ return x.reshape(b, n_tokens, n_heads * c_per_head) # B x N_tokens x C
245
+
246
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor:
247
+ # Input projections
248
+ q = self.q_proj(q)
249
+ k = self.k_proj(k)
250
+ v = self.v_proj(v)
251
+
252
+ # Separate into heads
253
+ q = self._separate_heads(q, self.num_heads)
254
+ k = self._separate_heads(k, self.num_heads)
255
+ v = self._separate_heads(v, self.num_heads)
256
+
257
+ # Attention
258
+ _, _, _, c_per_head = q.shape
259
+ attn = q @ k.permute(0, 1, 3, 2) # B x N_heads x N_tokens x N_tokens
260
+ attn = attn * self.inv_sqrt_c_per_head
261
+ attn = torch.softmax(attn, dim=-1)
262
+ # Get output
263
+ out = attn @ v
264
+ out = self._recombine_heads(out)
265
+ out = self.out_proj(out)
266
+ return out
efficientvit/__init__.py ADDED
File without changes
efficientvit/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (170 Bytes). View file
 
efficientvit/__pycache__/sam_model_zoo.cpython-310.pyc ADDED
Binary file (1.46 kB). View file
 
efficientvit/apps/__init__.py ADDED
File without changes
efficientvit/apps/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (175 Bytes). View file
 
efficientvit/apps/data_provider/__init__.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ # EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense Prediction
2
+ # Han Cai, Junyan Li, Muyan Hu, Chuang Gan, Song Han
3
+ # International Conference on Computer Vision (ICCV), 2023
4
+
5
+ from .augment import *
6
+ from .base import *
7
+ from .random_resolution import *
efficientvit/apps/data_provider/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (258 Bytes). View file
 
efficientvit/apps/data_provider/__pycache__/base.cpython-310.pyc ADDED
Binary file (6.34 kB). View file
 
efficientvit/apps/data_provider/augment/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ # EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense Prediction
2
+ # Han Cai, Junyan Li, Muyan Hu, Chuang Gan, Song Han
3
+ # International Conference on Computer Vision (ICCV), 2023
4
+
5
+ from .bbox import *
6
+ from .color_aug import *
efficientvit/apps/data_provider/augment/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (239 Bytes). View file
 
efficientvit/apps/data_provider/augment/__pycache__/bbox.cpython-310.pyc ADDED
Binary file (802 Bytes). View file
 
efficientvit/apps/data_provider/augment/__pycache__/color_aug.cpython-310.pyc ADDED
Binary file (3.13 kB). View file
 
efficientvit/apps/data_provider/augment/bbox.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense Prediction
2
+ # Han Cai, Junyan Li, Muyan Hu, Chuang Gan, Song Han
3
+ # International Conference on Computer Vision (ICCV), 2023
4
+
5
+ import numpy as np
6
+
7
+ __all__ = ["rand_bbox"]
8
+
9
+
10
+ def rand_bbox(
11
+ h: int,
12
+ w: int,
13
+ lam: float,
14
+ rand_func: callable = np.random.uniform,
15
+ ) -> tuple[int, int, int, int]:
16
+ """randomly sample bbox, used in cutmix"""
17
+ cut_rat = np.sqrt(1.0 - lam)
18
+ cut_w = w * cut_rat
19
+ cut_h = h * cut_rat
20
+
21
+ # uniform
22
+ cx = rand_func(0, w)
23
+ cy = rand_func(0, h)
24
+
25
+ bbx1 = int(np.clip(cx - cut_w / 2, 0, w))
26
+ bby1 = int(np.clip(cy - cut_h / 2, 0, h))
27
+ bbx2 = int(np.clip(cx + cut_w / 2, 0, w))
28
+ bby2 = int(np.clip(cy + cut_h / 2, 0, h))
29
+
30
+ return bbx1, bby1, bbx2, bby2
efficientvit/apps/data_provider/augment/color_aug.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense Prediction
2
+ # Han Cai, Junyan Li, Muyan Hu, Chuang Gan, Song Han
3
+ # International Conference on Computer Vision (ICCV), 2023
4
+
5
+ import numpy as np
6
+ import torchvision.transforms as transforms
7
+ from PIL import Image
8
+ from timm.data.auto_augment import rand_augment_transform
9
+
10
+ __all__ = ["ColorAug", "RandAug"]
11
+
12
+
13
+ class ImageAug:
14
+ def aug_image(self, image: Image.Image) -> Image.Image:
15
+ raise NotImplementedError
16
+
17
+ def __call__(self, feed_dict: dict or np.ndarray or Image.Image) -> dict or np.ndarray or Image.Image:
18
+ if isinstance(feed_dict, dict):
19
+ output_dict = feed_dict
20
+ image = feed_dict[self.key]
21
+ else:
22
+ output_dict = None
23
+ image = feed_dict
24
+ is_ndarray = isinstance(image, np.ndarray)
25
+ if is_ndarray:
26
+ image = Image.fromarray(image)
27
+
28
+ image = self.aug_image(image)
29
+
30
+ if is_ndarray:
31
+ image = np.array(image)
32
+
33
+ if output_dict is None:
34
+ return image
35
+ else:
36
+ output_dict[self.key] = image
37
+ return output_dict
38
+
39
+
40
+ class ColorAug(transforms.ColorJitter, ImageAug):
41
+ def __init__(self, brightness=0, contrast=0, saturation=0, hue=0, key="data"):
42
+ super().__init__(
43
+ brightness=brightness,
44
+ contrast=contrast,
45
+ saturation=saturation,
46
+ hue=hue,
47
+ )
48
+ self.key = key
49
+
50
+ def aug_image(self, image: Image.Image) -> Image.Image:
51
+ return transforms.ColorJitter.forward(self, image)
52
+
53
+ def forward(self, feed_dict: dict or np.ndarray or Image.Image) -> dict or np.ndarray or Image.Image:
54
+ return ImageAug.__call__(self, feed_dict)
55
+
56
+
57
+ class RandAug(ImageAug):
58
+ def __init__(self, config: dict[str, any], mean: tuple[float, float, float], key="data"):
59
+ n = config.get("n", 2)
60
+ m = config.get("m", 9)
61
+ mstd = config.get("mstd", 1.0)
62
+ inc = config.get("inc", 1)
63
+ tpct = config.get("tpct", 0.45)
64
+ config_str = f"rand-n{n}-m{m}-mstd{mstd}-inc{inc}"
65
+
66
+ aa_params = dict(
67
+ translate_pct=tpct,
68
+ img_mean=tuple([min(255, round(255 * x)) for x in mean]),
69
+ interpolation=Image.BICUBIC,
70
+ )
71
+ self.aug_op = rand_augment_transform(config_str, aa_params)
72
+ self.key = key
73
+
74
+ def aug_image(self, image: Image.Image) -> Image.Image:
75
+ return self.aug_op(image)
76
+
77
+ def __repr__(self):
78
+ return self.aug_op.__repr__()
efficientvit/apps/data_provider/base.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense Prediction
2
+ # Han Cai, Junyan Li, Muyan Hu, Chuang Gan, Song Han
3
+ # International Conference on Computer Vision (ICCV), 2023
4
+
5
+ import copy
6
+ import warnings
7
+
8
+ import torch.utils.data
9
+ from torch.utils.data.distributed import DistributedSampler
10
+
11
+ from efficientvit.apps.data_provider.random_resolution import RRSController
12
+ from efficientvit.models.utils import val2tuple
13
+
14
+ __all__ = ["parse_image_size", "random_drop_data", "DataProvider"]
15
+
16
+
17
+ def parse_image_size(size: int or str) -> tuple[int, int]:
18
+ if isinstance(size, str):
19
+ size = [int(val) for val in size.split("-")]
20
+ return size[0], size[1]
21
+ else:
22
+ return val2tuple(size, 2)
23
+
24
+
25
+ def random_drop_data(dataset, drop_size: int, seed: int, keys=("samples",)):
26
+ g = torch.Generator()
27
+ g.manual_seed(seed) # set random seed before sampling validation set
28
+ rand_indexes = torch.randperm(len(dataset), generator=g).tolist()
29
+
30
+ dropped_indexes = rand_indexes[:drop_size]
31
+ remaining_indexes = rand_indexes[drop_size:]
32
+
33
+ dropped_dataset = copy.deepcopy(dataset)
34
+ for key in keys:
35
+ setattr(dropped_dataset, key, [getattr(dropped_dataset, key)[idx] for idx in dropped_indexes])
36
+ setattr(dataset, key, [getattr(dataset, key)[idx] for idx in remaining_indexes])
37
+ return dataset, dropped_dataset
38
+
39
+
40
+ class DataProvider:
41
+ data_keys = ("samples",)
42
+ mean_std = {"mean": [0.485, 0.456, 0.406], "std": [0.229, 0.224, 0.225]}
43
+ SUB_SEED = 937162211 # random seed for sampling subset
44
+ VALID_SEED = 2147483647 # random seed for the validation set
45
+
46
+ name: str
47
+
48
+ def __init__(
49
+ self,
50
+ train_batch_size: int,
51
+ test_batch_size: int or None,
52
+ valid_size: int or float or None,
53
+ n_worker: int,
54
+ image_size: int or list[int] or str or list[str],
55
+ num_replicas: int or None = None,
56
+ rank: int or None = None,
57
+ train_ratio: float or None = None,
58
+ drop_last: bool = False,
59
+ ):
60
+ warnings.filterwarnings("ignore")
61
+ super().__init__()
62
+
63
+ # batch_size & valid_size
64
+ self.train_batch_size = train_batch_size
65
+ self.test_batch_size = test_batch_size or self.train_batch_size
66
+ self.valid_size = valid_size
67
+
68
+ # image size
69
+ if isinstance(image_size, list):
70
+ self.image_size = [parse_image_size(size) for size in image_size]
71
+ self.image_size.sort() # e.g., 160 -> 224
72
+ RRSController.IMAGE_SIZE_LIST = copy.deepcopy(self.image_size)
73
+ self.active_image_size = RRSController.ACTIVE_SIZE = self.image_size[-1]
74
+ else:
75
+ self.image_size = parse_image_size(image_size)
76
+ RRSController.IMAGE_SIZE_LIST = [self.image_size]
77
+ self.active_image_size = RRSController.ACTIVE_SIZE = self.image_size
78
+
79
+ # distributed configs
80
+ self.num_replicas = num_replicas
81
+ self.rank = rank
82
+
83
+ # build datasets
84
+ train_dataset, val_dataset, test_dataset = self.build_datasets()
85
+
86
+ if train_ratio is not None and train_ratio < 1.0:
87
+ assert 0 < train_ratio < 1
88
+ _, train_dataset = random_drop_data(
89
+ train_dataset,
90
+ int(train_ratio * len(train_dataset)),
91
+ self.SUB_SEED,
92
+ self.data_keys,
93
+ )
94
+
95
+ # build data loader
96
+ self.train = self.build_dataloader(train_dataset, train_batch_size, n_worker, drop_last=drop_last, train=True)
97
+ self.valid = self.build_dataloader(val_dataset, test_batch_size, n_worker, drop_last=False, train=False)
98
+ self.test = self.build_dataloader(test_dataset, test_batch_size, n_worker, drop_last=False, train=False)
99
+ if self.valid is None:
100
+ self.valid = self.test
101
+ self.sub_train = None
102
+
103
+ @property
104
+ def data_shape(self) -> tuple[int, ...]:
105
+ return 3, self.active_image_size[0], self.active_image_size[1]
106
+
107
+ def build_valid_transform(self, image_size: tuple[int, int] or None = None) -> any:
108
+ raise NotImplementedError
109
+
110
+ def build_train_transform(self, image_size: tuple[int, int] or None = None) -> any:
111
+ raise NotImplementedError
112
+
113
+ def build_datasets(self) -> tuple[any, any, any]:
114
+ raise NotImplementedError
115
+
116
+ def build_dataloader(self, dataset: any or None, batch_size: int, n_worker: int, drop_last: bool, train: bool):
117
+ if dataset is None:
118
+ return None
119
+ if isinstance(self.image_size, list) and train:
120
+ from efficientvit.apps.data_provider.random_resolution._data_loader import RRSDataLoader
121
+
122
+ dataloader_class = RRSDataLoader
123
+ else:
124
+ dataloader_class = torch.utils.data.DataLoader
125
+ if self.num_replicas is None:
126
+ return dataloader_class(
127
+ dataset=dataset,
128
+ batch_size=batch_size,
129
+ shuffle=True,
130
+ num_workers=n_worker,
131
+ pin_memory=True,
132
+ drop_last=drop_last,
133
+ )
134
+ else:
135
+ sampler = DistributedSampler(dataset, self.num_replicas, self.rank)
136
+ return dataloader_class(
137
+ dataset=dataset,
138
+ batch_size=batch_size,
139
+ sampler=sampler,
140
+ num_workers=n_worker,
141
+ pin_memory=True,
142
+ drop_last=drop_last,
143
+ )
144
+
145
+ def set_epoch(self, epoch: int) -> None:
146
+ RRSController.set_epoch(epoch, len(self.train))
147
+ if isinstance(self.train.sampler, DistributedSampler):
148
+ self.train.sampler.set_epoch(epoch)
149
+
150
+ def assign_active_image_size(self, new_size: int or tuple[int, int]) -> None:
151
+ self.active_image_size = val2tuple(new_size, 2)
152
+ new_transform = self.build_valid_transform(self.active_image_size)
153
+ # change the transform of the valid and test set
154
+ self.valid.dataset.transform = self.test.dataset.transform = new_transform
155
+
156
+ def sample_val_dataset(self, train_dataset, valid_transform) -> tuple[any, any]:
157
+ if self.valid_size is not None:
158
+ if 0 < self.valid_size < 1:
159
+ valid_size = int(self.valid_size * len(train_dataset))
160
+ else:
161
+ assert self.valid_size >= 1
162
+ valid_size = int(self.valid_size)
163
+ train_dataset, val_dataset = random_drop_data(
164
+ train_dataset,
165
+ valid_size,
166
+ self.VALID_SEED,
167
+ self.data_keys,
168
+ )
169
+ val_dataset.transform = valid_transform
170
+ else:
171
+ val_dataset = None
172
+ return train_dataset, val_dataset
173
+
174
+ def build_sub_train_loader(self, n_samples: int, batch_size: int) -> any:
175
+ # used for resetting BN running statistics
176
+ if self.sub_train is None:
177
+ self.sub_train = {}
178
+ if self.active_image_size in self.sub_train:
179
+ return self.sub_train[self.active_image_size]
180
+
181
+ # construct dataset and dataloader
182
+ train_dataset = copy.deepcopy(self.train.dataset)
183
+ if n_samples < len(train_dataset):
184
+ _, train_dataset = random_drop_data(
185
+ train_dataset,
186
+ n_samples,
187
+ self.SUB_SEED,
188
+ self.data_keys,
189
+ )
190
+ RRSController.ACTIVE_SIZE = self.active_image_size
191
+ train_dataset.transform = self.build_train_transform(image_size=self.active_image_size)
192
+ data_loader = self.build_dataloader(train_dataset, batch_size, self.train.num_workers, True, False)
193
+
194
+ # pre-fetch data
195
+ self.sub_train[self.active_image_size] = [
196
+ data for data in data_loader for _ in range(max(1, n_samples // len(train_dataset)))
197
+ ]
198
+
199
+ return self.sub_train[self.active_image_size]
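
For orientation, a minimal sketch (not part of this upload) of how the `DataProvider` API in base.py above is typically filled in. `ToyDataProvider`, the `FakeData` dataset, and the transform choices are illustrative assumptions only; a real provider would plug in its own dataset and augmentation pipeline.

# Hypothetical sketch: subclassing DataProvider from base.py above.
# FakeData stands in for a real dataset; transforms are placeholders.
import torchvision.transforms as T
from torchvision.datasets import FakeData

from efficientvit.apps.data_provider.base import DataProvider, parse_image_size


class ToyDataProvider(DataProvider):
    name = "toy"

    def build_valid_transform(self, image_size=None):
        h, w = image_size or self.active_image_size
        return T.Compose([T.Resize((h, w)), T.ToTensor(), T.Normalize(**self.mean_std)])

    def build_train_transform(self, image_size=None):
        h, w = image_size or self.active_image_size
        return T.Compose([T.RandomResizedCrop((h, w)), T.ToTensor(), T.Normalize(**self.mean_std)])

    def build_datasets(self):
        train = FakeData(size=256, image_size=(3, 224, 224), transform=self.build_train_transform())
        test = FakeData(size=64, image_size=(3, 224, 224), transform=self.build_valid_transform())
        return train, None, test  # no held-out validation split in this toy example


# "H-W" strings and plain ints are both accepted by parse_image_size:
assert parse_image_size("160-192") == (160, 192)
assert parse_image_size(224) == (224, 224)

provider = ToyDataProvider(
    train_batch_size=32, test_batch_size=64, valid_size=None,
    n_worker=0, image_size="160-192",
)

With `valid_size=None` the provider falls back to using the test loader for validation, mirroring the fallback in the constructor above.
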
efficientvit/apps/data_provider/random_resolution/__init__.py ADDED
@@ -0,0 +1,7 @@
1
+ """Random resolution data loader compatible with multi-processing and distributed training.
2
+
3
+ Replace PyTorch's DataLoader with RRSDataLoader to support random resolution
4
+ at training time; resolution sampling is controlled by RRSController.
5
+ """
6
+
7
+ from .controller import *
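
A usage sketch (again illustrative, not part of this upload): RRSDataLoader can stand in for torch.utils.data.DataLoader, with RRSController.set_epoch re-sampling the per-batch resolution schedule each epoch, as DataProvider.set_epoch does above. The candidate sizes and the FakeData dataset are assumptions; a real setup would use a train transform that actually consults RRSController.

# Hypothetical sketch: RRSDataLoader as a drop-in for torch.utils.data.DataLoader.
# With a plain ToTensor transform the batches keep their native size, since only
# RRS-aware train transforms read the resolution chosen by RRSController.
from torchvision.datasets import FakeData
from torchvision.transforms import ToTensor

from efficientvit.apps.data_provider.random_resolution.controller import RRSController
from efficientvit.apps.data_provider.random_resolution._data_loader import RRSDataLoader

RRSController.IMAGE_SIZE_LIST = [(160, 160), (192, 192), (224, 224)]  # candidate resolutions
RRSController.ACTIVE_SIZE = (224, 224)

train_set = FakeData(size=128, image_size=(3, 224, 224), transform=ToTensor())
train_loader = RRSDataLoader(train_set, batch_size=16, shuffle=True, num_workers=0)

for epoch in range(2):
    RRSController.set_epoch(epoch, len(train_loader))  # one resolution choice per batch
    for images, labels in train_loader:
        pass  # forward/backward pass would go here
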
efficientvit/apps/data_provider/random_resolution/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (491 Bytes).
 
efficientvit/apps/data_provider/random_resolution/__pycache__/controller.cpython-310.pyc ADDED
Binary file (3.31 kB).
 
efficientvit/apps/data_provider/random_resolution/_data_loader.py ADDED
@@ -0,0 +1,1538 @@
1
+ r"""This file is based on torch/utils/data/data_loader.py
2
+
3
+ Definition of the DataLoader and associated iterators that subclass _BaseDataLoaderIter
4
+
5
+ To support these two classes, in `./_utils` we define many utility methods and
6
+ functions to be run in multiprocessing. E.g., the data loading worker loop is
7
+ in `./_utils/worker.py`.
8
+ """
9
+
10
+ import functools
11
+ import itertools
12
+ import logging
13
+ import multiprocessing as python_multiprocessing
14
+ import os
15
+ import queue
16
+ import threading
17
+ import warnings
18
+ from typing import Any, Callable, Generic, Iterable, List, Optional, Sequence, TypeVar, Union
19
+
20
+ import torch
21
+ import torch.distributed as dist
22
+ import torch.multiprocessing as multiprocessing
23
+ import torch.utils.data.graph_settings
24
+ from torch._utils import ExceptionWrapper
25
+ from torch.utils.data import (
26
+ BatchSampler,
27
+ Dataset,
28
+ IterableDataset,
29
+ IterDataPipe,
30
+ MapDataPipe,
31
+ RandomSampler,
32
+ Sampler,
33
+ SequentialSampler,
34
+ _utils,
35
+ )
36
+ from torch.utils.data.datapipes.datapipe import _IterDataPipeSerializationWrapper, _MapDataPipeSerializationWrapper
37
+
38
+ from ._data_worker import _worker_loop
39
+
40
+ __all__ = ["RRSDataLoader"]
41
+
42
+ T_co = TypeVar("T_co", covariant=True)
43
+ T = TypeVar("T")
44
+ _worker_init_fn_t = Callable[[int], None]
45
+
46
+ # Ideally we would parameterize `DataLoader` by the return type of `collate_fn`, but there is currently no way to have that
47
+ # type parameter set to a default value if the user doesn't pass in a custom 'collate_fn'.
48
+ # See https://github.com/python/mypy/issues/3737.
49
+ _collate_fn_t = Callable[[List[T]], Any]
50
+
51
+
52
+ # These functions used to be defined in this file. However, it was moved to
53
+ # _utils/collate.py. Although it is rather hard to access this from user land
54
+ # (one has to explicitly directly `import torch.utils.data.dataloader`), there
55
+ # probably is user code out there using it. This aliasing maintains BC in this
56
+ # aspect.
57
+ default_collate: _collate_fn_t = _utils.collate.default_collate
58
+ default_convert = _utils.collate.default_convert
59
+
60
+ get_worker_info = _utils.worker.get_worker_info
61
+
62
+ logger = logging.getLogger(__name__)
63
+
64
+
65
+ class _DatasetKind:
66
+ Map = 0
67
+ Iterable = 1
68
+
69
+ @staticmethod
70
+ def create_fetcher(kind, dataset, auto_collation, collate_fn, drop_last):
71
+ if kind == _DatasetKind.Map:
72
+ return _utils.fetch._MapDatasetFetcher(dataset, auto_collation, collate_fn, drop_last)
73
+ else:
74
+ return _utils.fetch._IterableDatasetFetcher(dataset, auto_collation, collate_fn, drop_last)
75
+
76
+
77
+ class _InfiniteConstantSampler(Sampler):
78
+ r"""Analogous to ``itertools.repeat(None, None)``.
79
+ Used as sampler for :class:`~torch.utils.data.IterableDataset`.
80
+
81
+ Args:
82
+ data_source (Dataset): dataset to sample from
83
+ """
84
+
85
+ def __init__(self):
86
+ super().__init__(None)
87
+
88
+ def __iter__(self):
89
+ while True:
90
+ yield None
91
+
92
+
93
+ def _get_distributed_settings():
94
+ if dist.is_available() and dist.is_initialized():
95
+ return dist.get_world_size(), dist.get_rank()
96
+ else:
97
+ return 1, 0
98
+
99
+
100
+ def _sharding_worker_init_fn(worker_init_fn, world_size, rank_id, worker_id):
101
+ global_worker_id = worker_id
102
+ info = torch.utils.data.get_worker_info()
103
+ assert info is not None
104
+ total_workers = info.num_workers
105
+ datapipe = info.dataset
106
+ assert isinstance(datapipe, (IterDataPipe, MapDataPipe))
107
+ # To distribute elements across distributed process evenly, we should shard data on distributed
108
+ # processes first then shard on worker processes
109
+ total_workers *= world_size
110
+ global_worker_id = global_worker_id * world_size + rank_id
111
+ # For BC, use default SHARDING_PRIORITIES
112
+ torch.utils.data.graph_settings.apply_sharding(datapipe, total_workers, global_worker_id)
113
+ if worker_init_fn is not None:
114
+ worker_init_fn(worker_id)
115
+
116
+
117
+ def _share_dist_seed(generator, pg):
118
+ _shared_seed = torch.empty((), dtype=torch.int64).random_(generator=generator)
119
+ if isinstance(pg, dist.ProcessGroup):
120
+ dist.broadcast(_shared_seed, src=0, group=pg)
121
+ return _shared_seed.item()
122
+
123
+
124
+ class RRSDataLoader(Generic[T_co]):
125
+ r"""
126
+ Data loader. Combines a dataset and a sampler, and provides an iterable over
127
+ the given dataset.
128
+
129
+ The :class:`~torch.utils.data.DataLoader` supports both map-style and
130
+ iterable-style datasets with single- or multi-process loading, customizing
131
+ loading order and optional automatic batching (collation) and memory pinning.
132
+
133
+ See :py:mod:`torch.utils.data` documentation page for more details.
134
+
135
+ Args:
136
+ dataset (Dataset): dataset from which to load the data.
137
+ batch_size (int, optional): how many samples per batch to load
138
+ (default: ``1``).
139
+ shuffle (bool, optional): set to ``True`` to have the data reshuffled
140
+ at every epoch (default: ``False``).
141
+ sampler (Sampler or Iterable, optional): defines the strategy to draw
142
+ samples from the dataset. Can be any ``Iterable`` with ``__len__``
143
+ implemented. If specified, :attr:`shuffle` must not be specified.
144
+ batch_sampler (Sampler or Iterable, optional): like :attr:`sampler`, but
145
+ returns a batch of indices at a time. Mutually exclusive with
146
+ :attr:`batch_size`, :attr:`shuffle`, :attr:`sampler`,
147
+ and :attr:`drop_last`.
148
+ num_workers (int, optional): how many subprocesses to use for data
149
+ loading. ``0`` means that the data will be loaded in the main process.
150
+ (default: ``0``)
151
+ collate_fn (Callable, optional): merges a list of samples to form a
152
+ mini-batch of Tensor(s). Used when using batched loading from a
153
+ map-style dataset.
154
+ pin_memory (bool, optional): If ``True``, the data loader will copy Tensors
155
+ into device/CUDA pinned memory before returning them. If your data elements
156
+ are a custom type, or your :attr:`collate_fn` returns a batch that is a custom type,
157
+ see the example below.
158
+ drop_last (bool, optional): set to ``True`` to drop the last incomplete batch,
159
+ if the dataset size is not divisible by the batch size. If ``False`` and
160
+ the size of dataset is not divisible by the batch size, then the last batch
161
+ will be smaller. (default: ``False``)
162
+ timeout (numeric, optional): if positive, the timeout value for collecting a batch
163
+ from workers. Should always be non-negative. (default: ``0``)
164
+ worker_init_fn (Callable, optional): If not ``None``, this will be called on each
165
+ worker subprocess with the worker id (an int in ``[0, num_workers - 1]``) as
166
+ input, after seeding and before data loading. (default: ``None``)
167
+ generator (torch.Generator, optional): If not ``None``, this RNG will be used
168
+ by RandomSampler to generate random indexes and multiprocessing to generate
169
+ `base_seed` for workers. (default: ``None``)
170
+ prefetch_factor (int, optional, keyword-only arg): Number of batches loaded
171
+ in advance by each worker. ``2`` means there will be a total of
172
+ 2 * num_workers batches prefetched across all workers. (default value depends
173
+ on the set value for num_workers. If value of num_workers=0 default is ``None``.
174
+ Otherwise if value of num_workers>0 default is ``2``).
175
+ persistent_workers (bool, optional): If ``True``, the data loader will not shutdown
176
+ the worker processes after a dataset has been consumed once. This allows to
177
+ keep the workers' `Dataset` instances alive. (default: ``False``)
178
+ pin_memory_device (str, optional): the data loader will copy Tensors
179
+ into device pinned memory before returning them if pin_memory is set to true.
180
+
181
+
182
+ .. warning:: If the ``spawn`` start method is used, :attr:`worker_init_fn`
183
+ cannot be an unpicklable object, e.g., a lambda function. See
184
+ :ref:`multiprocessing-best-practices` on more details related
185
+ to multiprocessing in PyTorch.
186
+
187
+ .. warning:: ``len(dataloader)`` heuristic is based on the length of the sampler used.
188
+ When :attr:`dataset` is an :class:`~torch.utils.data.IterableDataset`,
189
+ it instead returns an estimate based on ``len(dataset) / batch_size``, with proper
190
+ rounding depending on :attr:`drop_last`, regardless of multi-process loading
191
+ configurations. This represents the best guess PyTorch can make because PyTorch
192
+ trusts user :attr:`dataset` code in correctly handling multi-process
193
+ loading to avoid duplicate data.
194
+
195
+ However, if sharding results in multiple workers having incomplete last batches,
196
+ this estimate can still be inaccurate, because (1) an otherwise complete batch can
197
+ be broken into multiple ones and (2) more than one batch worth of samples can be
198
+ dropped when :attr:`drop_last` is set. Unfortunately, PyTorch can not detect such
199
+ cases in general.
200
+
201
+ See `Dataset Types`_ for more details on these two types of datasets and how
202
+ :class:`~torch.utils.data.IterableDataset` interacts with
203
+ `Multi-process data loading`_.
204
+
205
+ .. warning:: See :ref:`reproducibility`, and :ref:`dataloader-workers-random-seed`, and
206
+ :ref:`data-loading-randomness` notes for random seed related questions.
207
+ """
208
+
209
+ dataset: Dataset[T_co]
210
+ batch_size: Optional[int]
211
+ num_workers: int
212
+ pin_memory: bool
213
+ drop_last: bool
214
+ timeout: float
215
+ sampler: Union[Sampler, Iterable]
216
+ pin_memory_device: str
217
+ prefetch_factor: Optional[int]
218
+ _iterator: Optional["_BaseDataLoaderIter"]
219
+ __initialized = False
220
+
221
+ def __init__(
222
+ self,
223
+ dataset: Dataset[T_co],
224
+ batch_size: Optional[int] = 1,
225
+ shuffle: Optional[bool] = None,
226
+ sampler: Union[Sampler, Iterable, None] = None,
227
+ batch_sampler: Union[Sampler[Sequence], Iterable[Sequence], None] = None,
228
+ num_workers: int = 0,
229
+ collate_fn: Optional[_collate_fn_t] = None,
230
+ pin_memory: bool = False,
231
+ drop_last: bool = False,
232
+ timeout: float = 0,
233
+ worker_init_fn: Optional[_worker_init_fn_t] = None,
234
+ multiprocessing_context=None,
235
+ generator=None,
236
+ *,
237
+ prefetch_factor: Optional[int] = None,
238
+ persistent_workers: bool = False,
239
+ pin_memory_device: str = ""
240
+ ):
241
+ torch._C._log_api_usage_once("python.data_loader")
242
+
243
+ if num_workers < 0:
244
+ raise ValueError(
245
+ "num_workers option should be non-negative; " "use num_workers=0 to disable multiprocessing."
246
+ )
247
+
248
+ if timeout < 0:
249
+ raise ValueError("timeout option should be non-negative")
250
+
251
+ if num_workers == 0 and prefetch_factor is not None:
252
+ raise ValueError(
253
+ "prefetch_factor option could only be specified in multiprocessing."
254
+ "let num_workers > 0 to enable multiprocessing, otherwise set prefetch_factor to None."
255
+ )
256
+ elif num_workers > 0 and prefetch_factor is None:
257
+ prefetch_factor = 2
258
+ elif prefetch_factor is not None and prefetch_factor < 0:
259
+ raise ValueError("prefetch_factor option should be non-negative")
260
+
261
+ if persistent_workers and num_workers == 0:
262
+ raise ValueError("persistent_workers option needs num_workers > 0")
263
+
264
+ self.dataset = dataset
265
+ self.num_workers = num_workers
266
+ self.prefetch_factor = prefetch_factor
267
+ self.pin_memory = pin_memory
268
+ self.pin_memory_device = pin_memory_device
269
+ self.timeout = timeout
270
+ self.worker_init_fn = worker_init_fn
271
+ self.multiprocessing_context = multiprocessing_context
272
+
273
+ # Adds forward compatibilities so classic DataLoader can work with DataPipes:
274
+ # _DataPipeSerializationWrapper container makes it easier to serialize without redefining pickler
275
+ if isinstance(self.dataset, IterDataPipe):
276
+ self.dataset = _IterDataPipeSerializationWrapper(self.dataset)
277
+ elif isinstance(self.dataset, MapDataPipe):
278
+ self.dataset = _MapDataPipeSerializationWrapper(self.dataset)
279
+
280
+ # Arg-check dataset related before checking samplers because we want to
281
+ # tell users that iterable-style datasets are incompatible with custom
282
+ # samplers first, so that they don't learn that this combo doesn't work
283
+ # after spending time fixing the custom sampler errors.
284
+ if isinstance(dataset, IterableDataset):
285
+ self._dataset_kind = _DatasetKind.Iterable
286
+ # NOTE [ Custom Samplers and IterableDataset ]
287
+ #
288
+ # `IterableDataset` does not support custom `batch_sampler` or
289
+ # `sampler` since the key is irrelevant (unless we support
290
+ # generator-style dataset one day...).
291
+ #
292
+ # For `sampler`, we always create a dummy sampler. This is an
293
+ # infinite sampler even when the dataset may have an implemented
294
+ # finite `__len__` because in multi-process data loading, naive
295
+ # settings will return duplicated data (which may be desired), and
296
+ # thus using a sampler with length matching that of dataset will
297
+ # cause data loss (you may have duplicates of the first couple
298
+ # batches, but never see anything afterwards). Therefore,
299
+ # `IterableDataset` always uses an infinite sampler, an instance of
300
+ # `_InfiniteConstantSampler` defined above.
301
+ #
302
+ # A custom `batch_sampler` essentially only controls the batch size.
303
+ # However, it is unclear how useful it would be since an iterable-style
304
+ # dataset can handle that within itself. Moreover, it is pointless
305
+ # in multi-process data loading as the assignment order of batches
306
+ # to workers is an implementation detail so users can not control
307
+ # how to batchify each worker's iterable. Thus, we disable this
308
+ # option. If this turns out to be useful in future, we can re-enable
309
+ # this, and support custom samplers that specify the assignments to
310
+ # specific workers.
311
+ if isinstance(dataset, IterDataPipe):
312
+ if shuffle is not None:
313
+ dataset = torch.utils.data.graph_settings.apply_shuffle_settings(dataset, shuffle=shuffle)
314
+ # We cannot check `shuffle is not None` here, since previously `shuffle=False` was the default.
315
+ elif shuffle not in {False, None}:
316
+ raise ValueError(
317
+ "DataLoader with IterableDataset: expected unspecified "
318
+ "shuffle option, but got shuffle={}".format(shuffle)
319
+ )
320
+
321
+ if sampler is not None:
322
+ # See NOTE [ Custom Samplers and IterableDataset ]
323
+ raise ValueError(
324
+ "DataLoader with IterableDataset: expected unspecified "
325
+ "sampler option, but got sampler={}".format(sampler)
326
+ )
327
+ elif batch_sampler is not None:
328
+ # See NOTE [ Custom Samplers and IterableDataset ]
329
+ raise ValueError(
330
+ "DataLoader with IterableDataset: expected unspecified "
331
+ "batch_sampler option, but got batch_sampler={}".format(batch_sampler)
332
+ )
333
+ else:
334
+ shuffle = bool(shuffle)
335
+ self._dataset_kind = _DatasetKind.Map
336
+
337
+ if sampler is not None and shuffle:
338
+ raise ValueError("sampler option is mutually exclusive with " "shuffle")
339
+
340
+ if batch_sampler is not None:
341
+ # auto_collation with custom batch_sampler
342
+ if batch_size != 1 or shuffle or sampler is not None or drop_last:
343
+ raise ValueError(
344
+ "batch_sampler option is mutually exclusive " "with batch_size, shuffle, sampler, and " "drop_last"
345
+ )
346
+ batch_size = None
347
+ drop_last = False
348
+ elif batch_size is None:
349
+ # no auto_collation
350
+ if drop_last:
351
+ raise ValueError(
352
+ "batch_size=None option disables auto-batching " "and is mutually exclusive with drop_last"
353
+ )
354
+
355
+ if sampler is None: # give default samplers
356
+ if self._dataset_kind == _DatasetKind.Iterable:
357
+ # See NOTE [ Custom Samplers and IterableDataset ]
358
+ sampler = _InfiniteConstantSampler()
359
+ else: # map-style
360
+ if shuffle:
361
+ sampler = RandomSampler(dataset, generator=generator) # type: ignore[arg-type]
362
+ else:
363
+ sampler = SequentialSampler(dataset) # type: ignore[arg-type]
364
+
365
+ if batch_size is not None and batch_sampler is None:
366
+ # auto_collation without custom batch_sampler
367
+ batch_sampler = BatchSampler(sampler, batch_size, drop_last)
368
+
369
+ self.batch_size = batch_size
370
+ self.drop_last = drop_last
371
+ self.sampler = sampler
372
+ self.batch_sampler = batch_sampler
373
+ self.generator = generator
374
+
375
+ if collate_fn is None:
376
+ if self._auto_collation:
377
+ collate_fn = _utils.collate.default_collate
378
+ else:
379
+ collate_fn = _utils.collate.default_convert
380
+
381
+ self.collate_fn = collate_fn
382
+ self.persistent_workers = persistent_workers
383
+
384
+ self.__initialized = True
385
+ self._IterableDataset_len_called = None # See NOTE [ IterableDataset and __len__ ]
386
+
387
+ self._iterator = None
388
+
389
+ self.check_worker_number_rationality()
390
+
391
+ torch.set_vital("Dataloader", "enabled", "True") # type: ignore[attr-defined]
392
+
393
+ def _get_iterator(self) -> "_BaseDataLoaderIter":
394
+ if self.num_workers == 0:
395
+ return _SingleProcessDataLoaderIter(self)
396
+ else:
397
+ self.check_worker_number_rationality()
398
+ return _MultiProcessingDataLoaderIter(self)
399
+
400
+ @property
401
+ def multiprocessing_context(self):
402
+ return self.__multiprocessing_context
403
+
404
+ @multiprocessing_context.setter
405
+ def multiprocessing_context(self, multiprocessing_context):
406
+ if multiprocessing_context is not None:
407
+ if self.num_workers > 0:
408
+ if isinstance(multiprocessing_context, str):
409
+ valid_start_methods = multiprocessing.get_all_start_methods()
410
+ if multiprocessing_context not in valid_start_methods:
411
+ raise ValueError(
412
+ (
413
+ "multiprocessing_context option "
414
+ "should specify a valid start method in {!r}, but got "
415
+ "multiprocessing_context={!r}"
416
+ ).format(valid_start_methods, multiprocessing_context)
417
+ )
418
+ multiprocessing_context = multiprocessing.get_context(multiprocessing_context)
419
+
420
+ if not isinstance(multiprocessing_context, python_multiprocessing.context.BaseContext):
421
+ raise TypeError(
422
+ (
423
+ "multiprocessing_context option should be a valid context "
424
+ "object or a string specifying the start method, but got "
425
+ "multiprocessing_context={}"
426
+ ).format(multiprocessing_context)
427
+ )
428
+ else:
429
+ raise ValueError(
430
+ (
431
+ "multiprocessing_context can only be used with "
432
+ "multi-process loading (num_workers > 0), but got "
433
+ "num_workers={}"
434
+ ).format(self.num_workers)
435
+ )
436
+
437
+ self.__multiprocessing_context = multiprocessing_context
438
+
439
+ def __setattr__(self, attr, val):
440
+ if self.__initialized and attr in (
441
+ "batch_size",
442
+ "batch_sampler",
443
+ "sampler",
444
+ "drop_last",
445
+ "dataset",
446
+ "persistent_workers",
447
+ ):
448
+ raise ValueError(
449
+ "{} attribute should not be set after {} is " "initialized".format(attr, self.__class__.__name__)
450
+ )
451
+
452
+ super().__setattr__(attr, val)
453
+
454
+ # We quote '_BaseDataLoaderIter' since it isn't defined yet and the definition can't be moved up
455
+ # since '_BaseDataLoaderIter' references 'DataLoader'.
456
+ def __iter__(self) -> "_BaseDataLoaderIter":
457
+ # When using a single worker the returned iterator should be
458
+ # created every time to avoid resetting its state
459
+ # However, in the case of a multi-worker iterator
460
+ # the iterator is only created once in the lifetime of the
461
+ # DataLoader object so that workers can be reused
462
+ if self.persistent_workers and self.num_workers > 0:
463
+ if self._iterator is None:
464
+ self._iterator = self._get_iterator()
465
+ else:
466
+ self._iterator._reset(self)
467
+ return self._iterator
468
+ else:
469
+ return self._get_iterator()
470
+
471
+ @property
472
+ def _auto_collation(self):
473
+ return self.batch_sampler is not None
474
+
475
+ @property
476
+ def _index_sampler(self):
477
+ # The actual sampler used for generating indices for `_DatasetFetcher`
478
+ # (see _utils/fetch.py) to read data at each time. This would be
479
+ # `.batch_sampler` if in auto-collation mode, and `.sampler` otherwise.
480
+ # We can't change `.sampler` and `.batch_sampler` attributes for BC
481
+ # reasons.
482
+ if self._auto_collation:
483
+ return self.batch_sampler
484
+ else:
485
+ return self.sampler
486
+
487
+ def __len__(self) -> int:
488
+ if self._dataset_kind == _DatasetKind.Iterable:
489
+ # NOTE [ IterableDataset and __len__ ]
490
+ #
491
+ # For `IterableDataset`, `__len__` could be inaccurate when one naively
492
+ # does multi-processing data loading, since the samples will be duplicated.
493
+ # However, no real use case should be actually using that behavior, so
494
+ # it should count as a user error. We should generally trust user
495
+ # code to do the proper thing (e.g., configure each replica differently
496
+ # in `__iter__`), and give us the correct `__len__` if they choose to
497
+ # implement it (this will still throw if the dataset does not implement
498
+ # a `__len__`).
499
+ #
500
+ # To provide a further warning, we track if `__len__` was called on the
501
+ # `DataLoader`, save the returned value in `self._len_called`, and warn
502
+ # if the iterator ends up yielding more than this number of samples.
503
+
504
+ # Cannot statically verify that dataset is Sized
505
+ length = self._IterableDataset_len_called = len(self.dataset) # type: ignore[assignment, arg-type]
506
+ if self.batch_size is not None: # IterableDataset doesn't allow custom sampler or batch_sampler
507
+ from math import ceil
508
+
509
+ if self.drop_last:
510
+ length = length // self.batch_size
511
+ else:
512
+ length = ceil(length / self.batch_size)
513
+ return length
514
+ else:
515
+ return len(self._index_sampler)
516
+
517
+ def check_worker_number_rationality(self):
518
+ # This function check whether the dataloader's worker number is rational based on
519
+ # current system's resource. Current rule is that if the number of workers this
520
+ # Dataloader will create is bigger than the number of logical cpus that is allowed to
521
+ # use, then we will pop up a warning to let the user pay attention.
522
+ #
523
+ # e.g. If the current system has 2 physical CPUs with 16 cores each, and each core supports 2
524
+ # threads, then the total logical cpus here is 2 * 16 * 2 = 64. Let's say current
525
+ # DataLoader process can use half of them which is 32, then the rational max number of
526
+ # worker that initiated from this process is 32.
527
+ # Now, let's say the created DataLoader has num_workers = 40, which is bigger than 32.
528
+ # So the warning message is triggered to notify the user to lower the worker number if
529
+ # necessary.
530
+ #
531
+ #
532
+ # [Note] Please note that this function respects `cpuset` only when os.sched_getaffinity is
533
+ # available (available in most of Linux system, but not OSX and Windows).
534
+ # When os.sched_getaffinity is not available, os.cpu_count() is called instead, but
535
+ # it doesn't respect cpuset.
536
+ # We don't take threading into account since each worker process is single threaded
537
+ # at this time.
538
+ #
539
+ # We don't set any threading flags (eg. OMP_NUM_THREADS, MKL_NUM_THREADS, etc)
540
+ # other than `torch.set_num_threads` to 1 in the worker process, if the passing
541
+ # in functions use 3rd party modules that rely on those threading flags to determine
542
+ # how many thread to create (eg. numpy, etc), then it is caller's responsibility to
543
+ # set those flags correctly.
544
+ def _create_warning_msg(num_worker_suggest, num_worker_created, cpuset_checked):
545
+ suggested_max_worker_msg = (
546
+ (
547
+ (
548
+ "Our suggested max number of worker in current system is {}{}, which is smaller "
549
+ "than what this DataLoader is going to create."
550
+ ).format(
551
+ num_worker_suggest,
552
+ ("" if cpuset_checked else " (`cpuset` is not taken into account)"),
553
+ )
554
+ )
555
+ if num_worker_suggest is not None
556
+ else ("DataLoader is not able to compute a suggested max number of worker in current system.")
557
+ )
558
+
559
+ warn_msg = (
560
+ "This DataLoader will create {} worker processes in total. {} "
561
+ "Please be aware that excessive worker creation might get DataLoader running slow or even freeze, "
562
+ "lower the worker number to avoid potential slowness/freeze if necessary."
563
+ ).format(num_worker_created, suggested_max_worker_msg)
564
+ return warn_msg
565
+
566
+ if not self.num_workers or self.num_workers == 0:
567
+ return
568
+
569
+ # try to compute a suggested max number of worker based on system's resource
570
+ max_num_worker_suggest = None
571
+ cpuset_checked = False
572
+ if hasattr(os, "sched_getaffinity"):
573
+ try:
574
+ max_num_worker_suggest = len(os.sched_getaffinity(0))
575
+ cpuset_checked = True
576
+ except Exception:
577
+ pass
578
+ if max_num_worker_suggest is None:
579
+ # os.cpu_count() could return Optional[int]
580
+ # get cpu count first and check None in order to satisfy mypy check
581
+ cpu_count = os.cpu_count()
582
+ if cpu_count is not None:
583
+ max_num_worker_suggest = cpu_count
584
+
585
+ if max_num_worker_suggest is None:
586
+ warnings.warn(_create_warning_msg(max_num_worker_suggest, self.num_workers, cpuset_checked))
587
+ return
588
+
589
+ if self.num_workers > max_num_worker_suggest:
590
+ warnings.warn(_create_warning_msg(max_num_worker_suggest, self.num_workers, cpuset_checked))
591
+
592
+
593
+ class _BaseDataLoaderIter:
594
+ def __init__(self, loader: RRSDataLoader) -> None:
595
+ self._dataset = loader.dataset
596
+ self._shared_seed = None
597
+ self._pg = None
598
+ if isinstance(self._dataset, IterDataPipe):
599
+ if dist.is_available() and dist.is_initialized():
600
+ self._pg = dist.new_group(backend="gloo")
601
+ self._shared_seed = _share_dist_seed(loader.generator, self._pg)
602
+ shared_rng = torch.Generator()
603
+ shared_rng.manual_seed(self._shared_seed)
604
+ self._dataset = torch.utils.data.graph_settings.apply_random_seed(self._dataset, shared_rng)
605
+ self._dataset_kind = loader._dataset_kind
606
+ self._IterableDataset_len_called = loader._IterableDataset_len_called
607
+ self._auto_collation = loader._auto_collation
608
+ self._drop_last = loader.drop_last
609
+ self._index_sampler = loader._index_sampler
610
+ self._num_workers = loader.num_workers
611
+ ws, rank = _get_distributed_settings()
612
+ self._world_size = ws
613
+ self._rank = rank
614
+ # for other backends, pin_memory_device needs to be set. If not set,
615
+ # the default behaviour is the CUDA device. If pin_memory_device is selected
616
+ # and pin_memory is not set, the default behaviour is false.
617
+ if len(loader.pin_memory_device) == 0:
618
+ self._pin_memory = loader.pin_memory and torch.cuda.is_available()
619
+ self._pin_memory_device = None
620
+ else:
621
+ if not loader.pin_memory:
622
+ warn_msg = (
623
+ "pin memory device is set and pin_memory flag is not used then device pinned memory won't be used"
624
+ "please set pin_memory to true, if you need to use the device pin memory"
625
+ )
626
+ warnings.warn(warn_msg)
627
+
628
+ self._pin_memory = loader.pin_memory
629
+ self._pin_memory_device = loader.pin_memory_device
630
+ self._timeout = loader.timeout
631
+ self._collate_fn = loader.collate_fn
632
+ self._sampler_iter = iter(self._index_sampler)
633
+ self._base_seed = torch.empty((), dtype=torch.int64).random_(generator=loader.generator).item()
634
+ self._persistent_workers = loader.persistent_workers
635
+ self._num_yielded = 0
636
+ self._profile_name = "enumerate(DataLoader)#{}.__next__".format(self.__class__.__name__)
637
+
638
+ def __iter__(self) -> "_BaseDataLoaderIter":
639
+ return self
640
+
641
+ def _reset(self, loader, first_iter=False):
642
+ self._sampler_iter = iter(self._index_sampler)
643
+ self._num_yielded = 0
644
+ self._IterableDataset_len_called = loader._IterableDataset_len_called
645
+ if isinstance(self._dataset, IterDataPipe):
646
+ self._shared_seed = _share_dist_seed(loader.generator, self._pg)
647
+ shared_rng = torch.Generator()
648
+ shared_rng.manual_seed(self._shared_seed)
649
+ self._dataset = torch.utils.data.graph_settings.apply_random_seed(self._dataset, shared_rng)
650
+
651
+ def _next_index(self):
652
+ return next(self._sampler_iter) # may raise StopIteration
653
+
654
+ def _next_data(self):
655
+ raise NotImplementedError
656
+
657
+ def __next__(self) -> Any:
658
+ with torch.autograd.profiler.record_function(self._profile_name):
659
+ if self._sampler_iter is None:
660
+ # TODO(https://github.com/pytorch/pytorch/issues/76750)
661
+ self._reset() # type: ignore[call-arg]
662
+ data = self._next_data()
663
+ self._num_yielded += 1
664
+ if (
665
+ self._dataset_kind == _DatasetKind.Iterable
666
+ and self._IterableDataset_len_called is not None
667
+ and self._num_yielded > self._IterableDataset_len_called
668
+ ):
669
+ warn_msg = (
670
+ "Length of IterableDataset {} was reported to be {} (when accessing len(dataloader)), but {} "
671
+ "samples have been fetched. "
672
+ ).format(self._dataset, self._IterableDataset_len_called, self._num_yielded)
673
+ if self._num_workers > 0:
674
+ warn_msg += (
675
+ "For multiprocessing data-loading, this could be caused by not properly configuring the "
676
+ "IterableDataset replica at each worker. Please see "
677
+ "https://pytorch.org/docs/stable/data.html#torch.utils.data.IterableDataset for examples."
678
+ )
679
+ warnings.warn(warn_msg)
680
+ return data
681
+
682
+ def __len__(self) -> int:
683
+ return len(self._index_sampler)
684
+
685
+ def __getstate__(self):
686
+ # TODO: add limited pickling support for sharing an iterator
687
+ # across multiple threads for HOGWILD.
688
+ # Probably the best way to do this is by moving the sample pushing
689
+ # to a separate thread and then just sharing the data queue
690
+ # but signalling the end is tricky without a non-blocking API
691
+ raise NotImplementedError("{} cannot be pickled".format(self.__class__.__name__))
692
+
693
+
694
+ class _SingleProcessDataLoaderIter(_BaseDataLoaderIter):
695
+ def __init__(self, loader):
696
+ super().__init__(loader)
697
+ assert self._timeout == 0
698
+ assert self._num_workers == 0
699
+
700
+ # Adds forward compatibilities so classic DataLoader can work with DataPipes:
701
+ # Taking care of distributed sharding
702
+ if isinstance(self._dataset, (IterDataPipe, MapDataPipe)):
703
+ # For BC, use default SHARDING_PRIORITIES
704
+ torch.utils.data.graph_settings.apply_sharding(self._dataset, self._world_size, self._rank)
705
+
706
+ self._dataset_fetcher = _DatasetKind.create_fetcher(
707
+ self._dataset_kind,
708
+ self._dataset,
709
+ self._auto_collation,
710
+ self._collate_fn,
711
+ self._drop_last,
712
+ )
713
+
714
+ def _next_data(self):
715
+ index = self._next_index() # may raise StopIteration
716
+ data = self._dataset_fetcher.fetch(index) # may raise StopIteration
717
+ if self._pin_memory:
718
+ data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
719
+ return data
720
+
721
+
722
+ class _MultiProcessingDataLoaderIter(_BaseDataLoaderIter):
723
+ r"""Iterates once over the DataLoader's dataset, as specified by the sampler"""
724
+
725
+ # NOTE [ Data Loader Multiprocessing Shutdown Logic ]
726
+ #
727
+ # Preliminary:
728
+ #
729
+ # Our data model looks like this (queues are indicated with curly brackets):
730
+ #
731
+ # main process ||
732
+ # | ||
733
+ # {index_queue} ||
734
+ # | ||
735
+ # worker processes || DATA
736
+ # | ||
737
+ # {worker_result_queue} || FLOW
738
+ # | ||
739
+ # pin_memory_thread of main process || DIRECTION
740
+ # | ||
741
+ # {data_queue} ||
742
+ # | ||
743
+ # data output \/
744
+ #
745
+ # P.S. `worker_result_queue` and `pin_memory_thread` part may be omitted if
746
+ # `pin_memory=False`.
747
+ #
748
+ #
749
+ # Terminating multiprocessing logic requires very careful design. In
750
+ # particular, we need to make sure that
751
+ #
752
+ # 1. The iterator gracefully exits the workers when its last reference is
753
+ # gone or it is depleted.
754
+ #
755
+ # In this case, the workers should be gracefully exited because the
756
+ # main process may still need to continue to run, and we want cleaning
757
+ # up code in the workers to be executed (e.g., releasing GPU memory).
758
+ # Naturally, we implement the shutdown logic in `__del__` of
759
+ # DataLoaderIterator.
760
+ #
761
+ # We delay the discussion on the logic in this case until later.
762
+ #
763
+ # 2. The iterator exits the workers when the loader process and/or worker
764
+ # processes exits normally or with error.
765
+ #
766
+ # We set all workers and `pin_memory_thread` to have `daemon=True`.
767
+ #
768
+ # You may ask, why can't we make the workers non-daemonic, and
769
+ # gracefully exit using the same logic as we have in `__del__` when the
770
+ # iterator gets deleted (see 1 above)?
771
+ #
772
+ # First of all, `__del__` is **not** guaranteed to be called when
773
+ # interpreter exits. Even if it is called, by the time it executes,
774
+ # many Python core library resources may already be freed, and even
775
+ # simple things like acquiring an internal lock of a queue may hang.
776
+ # Therefore, in this case, we actually need to prevent `__del__` from
777
+ # being executed, and rely on the automatic termination of daemonic
778
+ # children.
779
+ #
780
+ # Thus, we register an `atexit` hook that sets a global flag
781
+ # `_utils.python_exit_status`. Since `atexit` hooks are executed in the
782
+ # reverse order of registration, we are guaranteed that this flag is
783
+ # set before library resources we use are freed (which, at least in
784
+ # CPython, is done via an `atexit` handler defined in
785
+ # `multiprocessing/util.py`
786
+ # https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/util.py#L320-L362
787
+ # registered when an object requiring this mechanism is first
788
+ # created, e.g., `mp.Queue`
789
+ # https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/context.py#L100-L103
790
+ # https://github.com/python/cpython/blob/c606624af8d4cb3b4a052fb263bb983b3f87585b/Lib/multiprocessing/queues.py#L29
791
+ # )
792
+ #
793
+ # So in `__del__`, we check if `_utils.python_exit_status` is set or
794
+ # `None` (freed), and perform no-op if so.
795
+ #
796
+ # However, simply letting library clean-up codes run can also be bad,
797
+ # because such codes (i.e., `multiprocessing.util._exit_function()`)
798
+ # include joining putting threads for `mp.Queue`, which can be blocking.
799
+ # Hence, the main process putting threads are called with
800
+ # `cancel_join_thread` at creation. See later section
801
+ # [ 3b. A process won't hang when putting into a queue; ]
802
+ # for more details.
803
+ #
804
+ # Here are two example cases where library clean-up codes can run
805
+ # before `__del__` is called:
806
+ #
807
+ # 1. If we hold onto a reference to the iterator, it more often
808
+ # than not tries to do `multiprocessing` library cleaning before
809
+ # clearing the alive referenced objects (https://github.com/pytorch/pytorch/issues/48666)
810
+ # and thus prevents our cleaning-up code from running first.
811
+ #
812
+ # 2. A similar issue arises when a `DataLoader` is used in a subprocess.
813
+ # When a process ends, it shuts all its daemonic children
814
+ # down with a SIGTERM (instead of joining them without a timeout).
815
+ # Similarly for threads, but by a different mechanism. This fact,
816
+ # together with a few implementation details of multiprocessing, forces
817
+ # us to make workers daemonic. All of our problems arise when a
818
+ # DataLoader is used in a subprocess, and are caused by multiprocessing
819
+ # code which looks more or less like this:
820
+ #
821
+ # try:
822
+ # your_function_using_a_dataloader()
823
+ # finally:
824
+ # multiprocessing.util._exit_function()
825
+ #
826
+ # The joining/termination mentioned above happens inside
827
+ # `_exit_function()`. Now, if `your_function_using_a_dataloader()`
828
+ # throws, the stack trace stored in the exception will prevent the
829
+ # frame which uses `DataLoaderIter` to be freed. If the frame has any
830
+ # reference to the `DataLoaderIter` (e.g., in a method of the iter),
831
+ # its `__del__`, which starts the shutdown procedure, will not be
832
+ # called. That, in turn, means that workers aren't notified. Attempting
833
+ # to join in `_exit_function` will then result in a hang.
834
+ #
835
+ # For context, `_exit_function` is also registered as an `atexit` call.
836
+ # So it is unclear to me (@ssnl) why this is needed in a finally block.
837
+ # The code dates back to 2008 and there is no comment on the original
838
+ # PEP 371 or patch https://bugs.python.org/issue3050 (containing both
839
+ # the finally block and the `atexit` registration) that explains this.
840
+ #
841
+ #
842
+ # Finally, another choice is to just shutdown workers with logic in 1
843
+ # above whenever we see an error in `next`. This isn't ideal because
844
+ # a. It prevents users from using try-catch to resume data loading.
845
+ # b. It doesn't prevent hanging if users have references to the
846
+ # iterator.
847
+ #
848
+ # 3. All processes exit if any of them die unexpectedly by fatal signals.
849
+ #
850
+ # As shown above, the workers are set as daemonic children of the main
851
+ # process. However, automatic cleaning-up of such child processes only
852
+ # happens if the parent process exits gracefully (e.g., not via fatal
853
+ # signals like SIGKILL). So we must ensure that each process will exit
854
+ # even the process that should send/receive data to/from it were
855
+ # killed, i.e.,
856
+ #
857
+ # a. A process won't hang when getting from a queue.
858
+ #
859
+ # Even with carefully designed data dependencies (i.e., a `put()`
860
+ # always corresponding to a `get()`), hanging on `get()` can still
861
+ # happen when data in queue is corrupted (e.g., due to
862
+ # `cancel_join_thread` or unexpected exit).
863
+ #
864
+ # For child exit, we set a timeout whenever we try to get data
865
+ # from `data_queue`, and check the workers' status on each timeout
866
+ # and error.
867
+ # See `_DataLoaderiter._get_batch()` and
868
+ # `_DataLoaderiter._try_get_data()` for details.
869
+ #
870
+ # Additionally, for child exit on non-Windows platforms, we also
871
+ # register a SIGCHLD handler (which is supported on Windows) on
872
+ # the main process, which checks if any of the workers fail in the
873
+ # (Python) handler. This is more efficient and faster in detecting
874
+ # worker failures, compared to only using the above mechanism.
875
+ # See `DataLoader.cpp` and `_utils/signal_handling.py` for details.
876
+ #
877
+ # For `.get()` calls where the sender(s) is not the workers, we
878
+ # guard them with timeouts, and check the status of the sender
879
+ # when timeout happens:
880
+ # + in the workers, the `_utils.worker.ManagerWatchdog` class
881
+ # checks the status of the main process.
882
+ # + if `pin_memory=True`, when getting from `pin_memory_thread`,
883
+ # check `pin_memory_thread` status periodically until `.get()`
884
+ # returns or see that `pin_memory_thread` died.
885
+ #
886
+ # b. A process won't hang when putting into a queue;
887
+ #
888
+ # We use `mp.Queue` which has a separate background thread to put
889
+ # objects from an unbounded buffer array. The background thread is
890
+ # daemonic and usually automatically joined when the process
891
+ # *exits*.
892
+ #
893
+ # In case that the receiver has ended abruptly while
894
+ # reading from the pipe, the join will hang forever. The usual
895
+ # solution for this in Python is calling `q.cancel_join_thread`,
896
+ # which prevents automatically joining it when finalizing
897
+ # (exiting).
898
+ #
899
+ # Nonetheless, `cancel_join_thread` must only be called when the
900
+ # queue is **not** going to be read from or write into by another
901
+ # process, because it may hold onto a lock or leave corrupted data
902
+ # in the queue, leading other readers/writers to hang.
903
+ #
904
+ # Hence,
905
+ # + For worker processes, we only do so (for their output
906
+ # queues, i.e., `worker_result_queue`) before exiting.
907
+ # + For `pin_memory_thread`, its output queue `data_queue` is a
908
+ # `queue.Queue` that does blocking `put` if the queue is full.
909
+ # So there is no above problem, but as a result, in
910
+ # `_pin_memory_loop`, we do need to wrap the `put` in a loop
911
+ # that breaks not only upon success, but also when the main
912
+ # process stops reading, i.e., is shutting down.
913
+ # + For loader process, we `cancel_join_thread()` for all
914
+ # `_index_queues` because the whole purpose of workers and
915
+ # `pin_memory_thread` is to serve the loader process. If
916
+ # loader process is already exiting, we don't really care if
917
+ # the queues are corrupted.
918
+ #
919
+ #
920
+ # Now let's get back to 1:
921
+ # how we gracefully exit the workers when the last reference to the
922
+ # iterator is gone.
923
+ #
924
+ # To achieve this, we implement the following logic along with the design
925
+ # choices mentioned above:
926
+ #
927
+ # `workers_done_event`:
928
+ # A `multiprocessing.Event` shared among the main process and all worker
929
+ # processes. This is used to signal the workers that the iterator is
930
+ # shutting down. After it is set, they will not send processed data to
931
+ # queues anymore, and only wait for the final `None` before exiting.
932
+ # `done_event` isn't strictly needed. I.e., we can just check for `None`
933
+ # from the input queue, but it allows us to skip wasting resources
934
+ # processing data if we are already shutting down.
935
+ #
936
+ # `pin_memory_thread_done_event`:
937
+ # A `threading.Event` for a similar purpose to that of
938
+ # `workers_done_event`, but is for the `pin_memory_thread`. The reason
939
+ # that separate events are needed is that `pin_memory_thread` reads from
940
+ # the output queue of the workers. But the workers, upon seeing that
941
+ # `workers_done_event` is set, only wants to see the final `None`, and is
942
+ # not required to flush all data in the output queue (e.g., it may call
943
+ # `cancel_join_thread` on that queue if its `IterableDataset` iterator
944
+ # happens to exhaust coincidentally, which is out of the control of the
945
+ # main process). Thus, since we will exit `pin_memory_thread` before the
946
+ # workers (see below), two separate events are used.
947
+ #
948
+ # NOTE: In short, the protocol is that the main process will set these
949
+ # `done_event`s and then the corresponding processes/threads a `None`,
950
+ # and that they may exit at any time after receiving the `None`.
951
+ #
952
+ # NOTE: Using `None` as the final signal is valid, since normal data will
953
+ # always be a 2-tuple with the 1st element being the index of the data
954
+ # transferred (different from dataset index/key), and the 2nd being
955
+ # either the dataset key or the data sample (depending on which part
956
+ # of the data model the queue is at).
957
+ #
958
+ # [ worker processes ]
959
+ # While loader process is alive:
960
+ # Get from `index_queue`.
961
+ # If get anything else,
962
+ # Check `workers_done_event`.
963
+ # If set, continue to next iteration
964
+ # i.e., keep getting until see the `None`, then exit.
965
+ # Otherwise, process data:
966
+ # If is fetching from an `IterableDataset` and the iterator
967
+ # is exhausted, send an `_IterableDatasetStopIteration`
968
+ # object to signal iteration end. The main process, upon
969
+ # receiving such an object, will send `None` to this
970
+ # worker and not use the corresponding `index_queue`
971
+ # anymore.
972
+ # If timed out,
973
+ # No matter `workers_done_event` is set (still need to see `None`)
974
+ # or not, must continue to next iteration.
975
+ # (outside loop)
976
+ # If `workers_done_event` is set, (this can be False with `IterableDataset`)
977
+ # `data_queue.cancel_join_thread()`. (Everything is ending here:
978
+ # main process won't read from it;
979
+ # other workers will also call
980
+ # `cancel_join_thread`.)
981
+ #
982
+ # [ pin_memory_thread ]
983
+ # # No need to check main thread. If this thread is alive, the main loader
984
+ # # thread must be alive, because this thread is set as daemonic.
985
+ # While `pin_memory_thread_done_event` is not set:
986
+ # Get from `index_queue`.
987
+ # If timed out, continue to get in the next iteration.
988
+ # Otherwise, process data.
989
+ # While `pin_memory_thread_done_event` is not set:
990
+ # Put processed data to `data_queue` (a `queue.Queue` with blocking put)
991
+ # If timed out, continue to put in the next iteration.
992
+ # Otherwise, break, i.e., continuing to the out loop.
993
+ #
994
+ # NOTE: we don't check the status of the main thread because
995
+ # 1. if the process is killed by fatal signal, `pin_memory_thread`
996
+ # ends.
997
+ # 2. in other cases, either the cleaning-up in __del__ or the
998
+ # automatic exit of daemonic thread will take care of it.
999
+ # This won't busy-wait either because `.get(timeout)` does not
1000
+ # busy-wait.
1001
+ #
1002
+ # [ main process ]
1003
+ # In the DataLoader Iter's `__del__`
1004
+ # b. Exit `pin_memory_thread`
1005
+ # i. Set `pin_memory_thread_done_event`.
1006
+ # ii Put `None` in `worker_result_queue`.
1007
+ # iii. Join the `pin_memory_thread`.
1008
+ # iv. `worker_result_queue.cancel_join_thread()`.
1009
+ #
1010
+ # c. Exit the workers.
1011
+ # i. Set `workers_done_event`.
1012
+ # ii. Put `None` in each worker's `index_queue`.
1013
+ # iii. Join the workers.
1014
+ # iv. Call `.cancel_join_thread()` on each worker's `index_queue`.
1015
+ #
1016
+ # NOTE: (c) is better placed after (b) because it may leave corrupted
1017
+ # data in `worker_result_queue`, which `pin_memory_thread`
1018
+ # reads from, in which case the `pin_memory_thread` can only
1019
+ # happen at timing out, which is slow. Nonetheless, the same thing
1020
+ # happens if a worker is killed by signal at unfortunate times,
1021
+ # but in other cases, we are better off having a non-corrupted
1022
+ # `worker_result_queue` for `pin_memory_thread`.
1023
+ #
1024
+ # NOTE: If `pin_memory=False`, there is no `pin_memory_thread` and (b)
1025
+ # can be omitted
1026
+ #
1027
+ # NB: `done_event`s isn't strictly needed. E.g., we can just check for
1028
+ # `None` from `index_queue`, but it allows us to skip wasting resources
1029
+ # processing indices already in `index_queue` if we are already shutting
1030
+ # down.
1031
+
1032
+ def __init__(self, loader):
1033
+ super().__init__(loader)
1034
+
1035
+ self._prefetch_factor = loader.prefetch_factor
1036
+
1037
+ assert self._num_workers > 0
1038
+ assert self._prefetch_factor > 0
1039
+
1040
+ if loader.multiprocessing_context is None:
1041
+ multiprocessing_context = multiprocessing
1042
+ else:
1043
+ multiprocessing_context = loader.multiprocessing_context
1044
+
1045
+ self._worker_init_fn = loader.worker_init_fn
1046
+
1047
+ # Adds forward compatibilities so classic DataLoader can work with DataPipes:
1048
+ # Additional worker init function will take care of sharding in MP and Distributed
1049
+ if isinstance(self._dataset, (IterDataPipe, MapDataPipe)):
1050
+ self._worker_init_fn = functools.partial(
1051
+ _sharding_worker_init_fn, self._worker_init_fn, self._world_size, self._rank
1052
+ )
1053
+
1054
+ # No certainty which module multiprocessing_context is
1055
+ self._worker_result_queue = multiprocessing_context.Queue() # type: ignore[var-annotated]
1056
+ self._worker_pids_set = False
1057
+ self._shutdown = False
1058
+ self._workers_done_event = multiprocessing_context.Event()
1059
+
1060
+ self._index_queues = []
1061
+ self._workers = []
1062
+ for i in range(self._num_workers):
1063
+ # No certainty which module multiprocessing_context is
1064
+ index_queue = multiprocessing_context.Queue() # type: ignore[var-annotated]
1065
+ # Need to `cancel_join_thread` here!
1066
+ # See sections (2) and (3b) above.
1067
+ index_queue.cancel_join_thread()
1068
+ w = multiprocessing_context.Process(
1069
+ target=_worker_loop,
1070
+ args=(
1071
+ self._dataset_kind,
1072
+ self._dataset,
1073
+ index_queue,
1074
+ self._worker_result_queue,
1075
+ self._workers_done_event,
1076
+ self._auto_collation,
1077
+ self._collate_fn,
1078
+ self._drop_last,
1079
+ self._base_seed,
1080
+ self._worker_init_fn,
1081
+ i,
1082
+ self._num_workers,
1083
+ self._persistent_workers,
1084
+ self._shared_seed,
1085
+ ),
1086
+ )
1087
+ w.daemon = True
1088
+ # NB: Process.start() actually takes some time as it needs to
1089
+ # start a process and pass the arguments over via a pipe.
1090
+ # Therefore, we only add a worker to self._workers list after
1091
+ # it started, so that we do not call .join() if program dies
1092
+ # before it starts, and __del__ tries to join but will get:
1093
+ # AssertionError: can only join a started process.
1094
+ w.start()
1095
+ self._index_queues.append(index_queue)
1096
+ self._workers.append(w)
1097
+
1098
+ if self._pin_memory:
1099
+ self._pin_memory_thread_done_event = threading.Event()
1100
+
1101
+ # Queue is not type-annotated
1102
+ self._data_queue = queue.Queue() # type: ignore[var-annotated]
1103
+ if self._pin_memory_device == "xpu":
1104
+ current_device = torch.xpu.current_device() # type: ignore[attr-defined]
1105
+ else:
1106
+ current_device = torch.cuda.current_device() # choose cuda for default
1107
+ pin_memory_thread = threading.Thread(
1108
+ target=_utils.pin_memory._pin_memory_loop,
1109
+ args=(
1110
+ self._worker_result_queue,
1111
+ self._data_queue,
1112
+ current_device,
1113
+ self._pin_memory_thread_done_event,
1114
+ self._pin_memory_device,
1115
+ ),
1116
+ )
1117
+ pin_memory_thread.daemon = True
1118
+ pin_memory_thread.start()
1119
+ # Similar to workers (see comment above), we only register
1120
+ # pin_memory_thread once it is started.
1121
+ self._pin_memory_thread = pin_memory_thread
1122
+ else:
1123
+ self._data_queue = self._worker_result_queue
1124
+
1125
+ # In some rare cases, persistent workers (daemonic processes)
1126
+ # would be terminated before `__del__` of iterator is invoked
1127
+ # when main process exits
1128
+ # It would cause failure when pin_memory_thread tries to read
1129
+ # corrupted data from worker_result_queue
1130
+ # atexit is used to shutdown thread and child processes in the
1131
+ # right sequence before main process exits
1132
+ if self._persistent_workers and self._pin_memory:
1133
+ import atexit
1134
+
1135
+ for w in self._workers:
1136
+ atexit.register(_MultiProcessingDataLoaderIter._clean_up_worker, w)
1137
+
1138
+ # .pid can be None only before process is spawned (not the case, so ignore)
1139
+ _utils.signal_handling._set_worker_pids(id(self), tuple(w.pid for w in self._workers)) # type: ignore[misc]
1140
+ _utils.signal_handling._set_SIGCHLD_handler()
1141
+ self._worker_pids_set = True
1142
+ self._reset(loader, first_iter=True)
1143
+
1144
+ def _reset(self, loader, first_iter=False):
1145
+ super()._reset(loader, first_iter)
1146
+ self._send_idx = 0 # idx of the next task to be sent to workers
1147
+ self._rcvd_idx = 0 # idx of the next task to be returned in __next__
1148
+ # information about data not yet yielded, i.e., tasks w/ indices in range [rcvd_idx, send_idx).
1149
+ # map: task idx => - (worker_id,) if data isn't fetched (outstanding)
1150
+ # \ (worker_id, data) if data is already fetched (out-of-order)
1151
+ self._task_info = {}
1152
+ self._tasks_outstanding = 0 # always equal to count(v for v in task_info.values() if len(v) == 1)
1153
+ # A list of booleans representing whether each worker still has work to
1154
+ # do, i.e., not having exhausted its iterable dataset object. It always
1155
+ # contains all `True`s if not using an iterable-style dataset
1156
+ # (i.e., if kind != Iterable).
1157
+ # Note that this indicates that a worker still has work to do *for this epoch*.
1158
+ # It does not mean that a worker is dead. In case of `_persistent_workers`,
1159
+ # the worker will be reset to available in the next epoch.
1160
+ self._workers_status = [True for i in range(self._num_workers)]
1161
+ # Reset the worker queue cycle so it resumes next epoch at worker 0
1162
+ self._worker_queue_idx_cycle = itertools.cycle(range(self._num_workers))
1163
+ # We resume the prefetching in case it was enabled
1164
+ if not first_iter:
1165
+ for idx in range(self._num_workers):
1166
+ self._index_queues[idx].put(_utils.worker._ResumeIteration(self._shared_seed))
1167
+ resume_iteration_cnt = self._num_workers
1168
+ while resume_iteration_cnt > 0:
1169
+ return_idx, return_data = self._get_data()
1170
+ if isinstance(return_idx, _utils.worker._ResumeIteration):
1171
+ assert return_data is None
1172
+ resume_iteration_cnt -= 1
1173
+ # prime the prefetch loop
1174
+ for _ in range(self._prefetch_factor * self._num_workers):
1175
+ self._try_put_index()
1176
+
1177
+ def _try_get_data(self, timeout=_utils.MP_STATUS_CHECK_INTERVAL):
1178
+ # Tries to fetch data from `self._data_queue` once for a given timeout.
1179
+ # This can also be used as inner loop of fetching without timeout, with
1180
+ # the sender status as the loop condition.
1181
+ #
1182
+ # This raises a `RuntimeError` if any worker died unexpectedly. This error
1183
+ # can come from either the SIGCHLD handler in `_utils/signal_handling.py`
1184
+ # (only for non-Windows platforms), or the manual check below on errors
1185
+ # and timeouts.
1186
+ #
1187
+ # Returns a 2-tuple:
1188
+ # (bool: whether data was successfully fetched, any: data if successful else None)
1189
+ try:
1190
+ data = self._data_queue.get(timeout=timeout)
1191
+ return (True, data)
1192
+ except Exception as e:
1193
+ # At timeout and error, we manually check whether any worker has
1194
+ # failed. Note that this is the only mechanism for Windows to detect
1195
+ # worker failures.
1196
+ failed_workers = []
1197
+ for worker_id, w in enumerate(self._workers):
1198
+ if self._workers_status[worker_id] and not w.is_alive():
1199
+ failed_workers.append(w)
1200
+ self._mark_worker_as_unavailable(worker_id)
1201
+ if len(failed_workers) > 0:
1202
+ pids_str = ", ".join(str(w.pid) for w in failed_workers)
1203
+ raise RuntimeError("DataLoader worker (pid(s) {}) exited unexpectedly".format(pids_str)) from e
1204
+ if isinstance(e, queue.Empty):
1205
+ return (False, None)
1206
+ import errno
1207
+ import tempfile
1208
+
1209
+ try:
1210
+ # Raise an exception if we are this close to the FDs limit.
1211
+ # Apparently, trying to open only one file is not a sufficient
1212
+ # test.
1213
+ # See NOTE [ DataLoader on Linux and open files limit ]
1214
+ fds_limit_margin = 10
1215
+ fs = [tempfile.NamedTemporaryFile() for i in range(fds_limit_margin)]
1216
+ except OSError as e:
1217
+ if e.errno == errno.EMFILE:
1218
+ raise RuntimeError(
1219
+ "Too many open files. Communication with the"
1220
+ " workers is no longer possible. Please increase the"
1221
+ " limit using `ulimit -n` in the shell or change the"
1222
+ " sharing strategy by calling"
1223
+ " `torch.multiprocessing.set_sharing_strategy('file_system')`"
1224
+ " at the beginning of your code"
1225
+ ) from None
1226
+ raise
1227
+
1228
+ # NOTE [ DataLoader on Linux and open files limit ]
1229
+ #
1230
+ # On Linux when DataLoader is used with multiprocessing we pass the data between
1231
+ # the root process and the workers through SHM files. We remove those files from
1232
+ # the filesystem as soon as they are created and keep them alive by
1233
+ # passing around their file descriptors through AF_UNIX sockets. (See
1234
+ # docs/source/multiprocessing.rst and 'Multiprocessing Technical Notes` in
1235
+ # the wiki (https://github.com/pytorch/pytorch/wiki).)
1236
+ #
1237
+ # This sometimes leads us to exceeding the open files limit. When that happens,
1238
+ # and the offending file descriptor is coming over a socket, the `socket` Python
1239
+ # package silently strips the file descriptor from the message, setting only the
1240
+ # `MSG_CTRUNC` flag (which might be a bit misleading since the manpage says that
1241
+ # it _indicates that some control data were discarded due to lack of space in
1242
+ # the buffer for ancillary data_). This might reflect the C implementation of
1243
+ # AF_UNIX sockets.
1244
+ #
1245
+ # This behaviour can be reproduced with the script and instructions at the
1246
+ # bottom of this note.
1247
+ #
1248
+ # When that happens, the standard Python `multiprocessing` (and not
1249
+ # `torch.multiprocessing`) raises a `RuntimeError: received 0 items of ancdata`
1250
+ #
1251
+ # Sometimes, instead of the FD being stripped, you may get an `OSError:
1252
+ # Too many open files`, both in the script below and in DataLoader. However,
1253
+ # this is rare and seems to be nondeterministic.
1254
+ #
1255
+ #
1256
+ # #!/usr/bin/env python3
1257
+ # import sys
1258
+ # import socket
1259
+ # import os
1260
+ # import array
1261
+ # import shutil
1262
+ # import socket
1263
+ #
1264
+ #
1265
+ # if len(sys.argv) != 4:
1266
+ # print("Usage: ", sys.argv[0], " tmp_dirname iteration (send|recv)")
1267
+ # sys.exit(1)
1268
+ #
1269
+ # if __name__ == '__main__':
1270
+ # dirname = sys.argv[1]
1271
+ # sock_path = dirname + "/sock"
1272
+ # iterations = int(sys.argv[2])
1273
+ # def dummy_path(i):
1274
+ # return dirname + "/" + str(i) + ".dummy"
1275
+ #
1276
+ #
1277
+ # if sys.argv[3] == 'send':
1278
+ # while not os.path.exists(sock_path):
1279
+ # pass
1280
+ # client = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
1281
+ # client.connect(sock_path)
1282
+ # for i in range(iterations):
1283
+ # fd = os.open(dummy_path(i), os.O_WRONLY | os.O_CREAT)
1284
+ # ancdata = array.array('i', [fd])
1285
+ # msg = bytes([i % 256])
1286
+ # print("Sending fd ", fd, " (iteration #", i, ")")
1287
+ # client.sendmsg([msg], [(socket.SOL_SOCKET, socket.SCM_RIGHTS, ancdata)])
1288
+ #
1289
+ #
1290
+ # else:
1291
+ # assert sys.argv[3] == 'recv'
1292
+ #
1293
+ # if os.path.exists(dirname):
1294
+ # raise Exception("Directory exists")
1295
+ #
1296
+ # os.mkdir(dirname)
1297
+ #
1298
+ # print("Opening socket...")
1299
+ # server = socket.socket(socket.AF_UNIX, socket.SOCK_DGRAM)
1300
+ # server.bind(sock_path)
1301
+ #
1302
+ # print("Listening...")
1303
+ # for i in range(iterations):
1304
+ # a = array.array('i')
1305
+ # msg, ancdata, flags, addr = server.recvmsg(1, socket.CMSG_SPACE(a.itemsize))
1306
+ # assert(len(ancdata) == 1)
1307
+ # cmsg_level, cmsg_type, cmsg_data = ancdata[0]
1308
+ # a.frombytes(cmsg_data)
1309
+ # print("Received fd ", a[0], " (iteration #", i, ")")
1310
+ #
1311
+ # shutil.rmtree(dirname)
1312
+ #
1313
+ # Steps to reproduce:
1314
+ #
1315
+ # 1. Run two shells and set lower file descriptor limit in the receiving one:
1316
+ # (shell1) ulimit -n 1020
1317
+ # (shell2) ulimit -n 1022
1318
+ #
1319
+ # 2. Run the script above with the `recv` option in the first shell
1320
+ # (shell1) ./test_socket.py sock_tmp 1017 recv
1321
+ #
1322
+ # 3. Run the script with the `send` option in the second shell:
1323
+ # (shell2) ./test_socket.py sock_tmp 1017 send
1324
+
1325
+ def _get_data(self):
1326
+ # Fetches data from `self._data_queue`.
1327
+ #
1328
+ # We check workers' status every `MP_STATUS_CHECK_INTERVAL` seconds,
1329
+ # which we achieve by running `self._try_get_data(timeout=MP_STATUS_CHECK_INTERVAL)`
1330
+ # in a loop. This is the only mechanism to detect worker failures for
1331
+ # Windows. For other platforms, a SIGCHLD handler is also used for
1332
+ # worker failure detection.
1333
+ #
1334
+ # If `pin_memory=True`, we also need to check if `pin_memory_thread` had
1335
+ # died at timeouts.
1336
+ if self._timeout > 0:
1337
+ success, data = self._try_get_data(self._timeout)
1338
+ if success:
1339
+ return data
1340
+ else:
1341
+ raise RuntimeError("DataLoader timed out after {} seconds".format(self._timeout))
1342
+ elif self._pin_memory:
1343
+ while self._pin_memory_thread.is_alive():
1344
+ success, data = self._try_get_data()
1345
+ if success:
1346
+ return data
1347
+ else:
1348
+ # while condition is false, i.e., pin_memory_thread died.
1349
+ raise RuntimeError("Pin memory thread exited unexpectedly")
1350
+ # In this case, `self._data_queue` is a `queue.Queue`, but we don't
1351
+ # need to call `.task_done()` because we don't use `.join()`.
1352
+ else:
1353
+ while True:
1354
+ success, data = self._try_get_data()
1355
+ if success:
1356
+ return data
1357
+
1358
+ def _next_data(self):
1359
+ while True:
1360
+ # If the worker responsible for `self._rcvd_idx` has already ended
1361
+ # and was unable to fulfill this task (due to exhausting an `IterableDataset`),
1362
+ # we try to advance `self._rcvd_idx` to find the next valid index.
1363
+ #
1364
+ # This part needs to run in the loop because both the `self._get_data()`
1365
+ # call and `_IterableDatasetStopIteration` check below can mark
1366
+ # extra worker(s) as dead.
1367
+ while self._rcvd_idx < self._send_idx:
1368
+ info = self._task_info[self._rcvd_idx]
1369
+ worker_id = info[0]
1370
+ if len(info) == 2 or self._workers_status[worker_id]: # has data or is still active
1371
+ break
1372
+ del self._task_info[self._rcvd_idx]
1373
+ self._rcvd_idx += 1
1374
+ else:
1375
+ # no valid `self._rcvd_idx` is found (i.e., didn't break)
1376
+ if not self._persistent_workers:
1377
+ self._shutdown_workers()
1378
+ raise StopIteration
1379
+
1380
+ # Now `self._rcvd_idx` is the batch index we want to fetch
1381
+
1382
+ # Check if the next sample has already been generated
1383
+ if len(self._task_info[self._rcvd_idx]) == 2:
1384
+ data = self._task_info.pop(self._rcvd_idx)[1]
1385
+ return self._process_data(data)
1386
+
1387
+ assert not self._shutdown and self._tasks_outstanding > 0
1388
+ idx, data = self._get_data()
1389
+ self._tasks_outstanding -= 1
1390
+ if self._dataset_kind == _DatasetKind.Iterable:
1391
+ # Check for _IterableDatasetStopIteration
1392
+ if isinstance(data, _utils.worker._IterableDatasetStopIteration):
1393
+ if self._persistent_workers:
1394
+ self._workers_status[data.worker_id] = False
1395
+ else:
1396
+ self._mark_worker_as_unavailable(data.worker_id)
1397
+ self._try_put_index()
1398
+ continue
1399
+
1400
+ if idx != self._rcvd_idx:
1401
+ # store out-of-order samples
1402
+ self._task_info[idx] += (data,)
1403
+ else:
1404
+ del self._task_info[idx]
1405
+ return self._process_data(data)
1406
+
1407
+ def _try_put_index(self):
1408
+ assert self._tasks_outstanding < self._prefetch_factor * self._num_workers
1409
+
1410
+ try:
1411
+ index = self._next_index()
1412
+ except StopIteration:
1413
+ return
1414
+ for _ in range(self._num_workers): # find the next active worker, if any
1415
+ worker_queue_idx = next(self._worker_queue_idx_cycle)
1416
+ if self._workers_status[worker_queue_idx]:
1417
+ break
1418
+ else:
1419
+ # not found (i.e., didn't break)
1420
+ return
1421
+
1422
+ self._index_queues[worker_queue_idx].put((self._send_idx, index))
1423
+ self._task_info[self._send_idx] = (worker_queue_idx,)
1424
+ self._tasks_outstanding += 1
1425
+ self._send_idx += 1
1426
+
1427
+ def _process_data(self, data):
1428
+ self._rcvd_idx += 1
1429
+ self._try_put_index()
1430
+ if isinstance(data, ExceptionWrapper):
1431
+ data.reraise()
1432
+ return data
1433
+
1434
+ def _mark_worker_as_unavailable(self, worker_id, shutdown=False):
1435
+ # Mark a worker as having finished its work e.g., due to
1436
+ # exhausting an `IterableDataset`. This should be used only when this
1437
+ # `_MultiProcessingDataLoaderIter` is going to continue running.
1438
+
1439
+ assert self._workers_status[worker_id] or (self._persistent_workers and shutdown)
1440
+
1441
+ # Signal termination to that specific worker.
1442
+ q = self._index_queues[worker_id]
1443
+ # Indicate that no more data will be put on this queue by the current
1444
+ # process.
1445
+ q.put(None)
1446
+
1447
+ # Note that we don't actually join the worker here, nor do we remove the
1448
+ # worker's pid from C side struct because (1) joining may be slow, and
1449
+ # (2) since we don't join, the worker may still raise error, and we
1450
+ # prefer capturing those, rather than ignoring them, even though they
1451
+ # are raised after the worker has finished its job.
1452
+ # Joining is deferred to `_shutdown_workers`, which is called when
1453
+ # all workers finish their jobs (e.g., `IterableDataset` replicas) or
1454
+ # when this iterator is garbage collected.
1455
+
1456
+ self._workers_status[worker_id] = False
1457
+
1458
+ assert self._workers_done_event.is_set() == shutdown
1459
+
1460
+ def _shutdown_workers(self):
1461
+ # Called when shutting down this `_MultiProcessingDataLoaderIter`.
1462
+ # See NOTE [ Data Loader Multiprocessing Shutdown Logic ] for details on
1463
+ # the logic of this function.
1464
+ if _utils is None or _utils.python_exit_status is True or _utils.python_exit_status is None:
1465
+ # See (2) of the note. If Python is shutting down, do no-op.
1466
+ return
1467
+ # Normal exit when last reference is gone / iterator is depleted.
1468
+ # See (1) and the second half of the note.
1469
+ if not self._shutdown:
1470
+ self._shutdown = True
1471
+ try:
1472
+ # Normal exit when last reference is gone / iterator is depleted.
1473
+ # See (1) and the second half of the note.
1474
+
1475
+ # Exit `pin_memory_thread` first because exiting workers may leave
1476
+ # corrupted data in `worker_result_queue` which `pin_memory_thread`
1477
+ # reads from.
1478
+ if hasattr(self, "_pin_memory_thread"):
1479
+ # Use hasattr in case error happens before we set the attribute.
1480
+ self._pin_memory_thread_done_event.set()
1481
+ # Send something to pin_memory_thread in case it is waiting
1482
+ # so that it can wake up and check `pin_memory_thread_done_event`
1483
+ self._worker_result_queue.put((None, None))
1484
+ self._pin_memory_thread.join()
1485
+ self._worker_result_queue.cancel_join_thread()
1486
+ self._worker_result_queue.close()
1487
+
1488
+ # Exit workers now.
1489
+ self._workers_done_event.set()
1490
+ for worker_id in range(len(self._workers)):
1491
+ # Get number of workers from `len(self._workers)` instead of
1492
+ # `self._num_workers` in case we error before starting all
1493
+ # workers.
1494
+ # If we are using workers_status with persistent_workers
1495
+ # we have to shut it down because the worker is paused
1496
+ if self._persistent_workers or self._workers_status[worker_id]:
1497
+ self._mark_worker_as_unavailable(worker_id, shutdown=True)
1498
+ for w in self._workers:
1499
+ # We should be able to join here, but in case anything went
1500
+ # wrong, we set a timeout and if the workers fail to join,
1501
+ # they are killed in the `finally` block.
1502
+ w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
1503
+ for q in self._index_queues:
1504
+ q.cancel_join_thread()
1505
+ q.close()
1506
+ finally:
1507
+ # Even though all this function does is putting into queues that
1508
+ # we have called `cancel_join_thread` on, weird things can
1509
+ # happen when a worker is killed by a signal, e.g., hanging in
1510
+ # `Event.set()`. So we need to guard this with SIGCHLD handler,
1511
+ # and remove pids from the C side data structure only at the
1512
+ # end.
1513
+ #
1514
+ # FIXME: Unfortunately, for Windows, we are missing a worker
1515
+ # error detection mechanism here in this function, as it
1516
+ # doesn't provide a SIGCHLD handler.
1517
+ if self._worker_pids_set:
1518
+ _utils.signal_handling._remove_worker_pids(id(self))
1519
+ self._worker_pids_set = False
1520
+ for w in self._workers:
1521
+ if w.is_alive():
1522
+ # Existing mechanisms try to make the workers exit
1523
+ # peacefully, but in case that we unfortunately reach
1524
+ # here, which we shouldn't, (e.g., pytorch/pytorch#39570),
1525
+ # we kill the worker.
1526
+ w.terminate()
1527
+
1528
+ # staticmethod is used to remove reference to `_MultiProcessingDataLoaderIter`
1529
+ @staticmethod
1530
+ def _clean_up_worker(w):
1531
+ try:
1532
+ w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
1533
+ finally:
1534
+ if w.is_alive():
1535
+ w.terminate()
1536
+
1537
+ def __del__(self):
1538
+ self._shutdown_workers()
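
The iterator above mirrors the stock `torch.utils.data.DataLoader` multiprocessing machinery, so the knobs its comments keep referring to are the usual constructor arguments. A minimal sketch of how they map onto the internals, using only the standard PyTorch API (the dataset, shapes, and values below are illustrative):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Toy dataset; shapes and sizes are illustrative only.
dataset = TensorDataset(torch.randn(64, 3, 224, 224), torch.randint(0, 10, (64,)))

loader = DataLoader(
    dataset,
    batch_size=8,
    num_workers=2,            # two worker processes, each with its own index_queue
    prefetch_factor=2,        # keep 2 * num_workers batches in flight (_try_put_index)
    pin_memory=True,          # start the pin_memory_thread reading worker_result_queue
    persistent_workers=True,  # workers survive across epochs; _reset() resumes them
)

for images, labels in loader:   # each __next__ goes through _next_data() above
    pass                        # shutdown happens in __del__ -> _shutdown_workers()
```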
efficientvit/apps/data_provider/random_resolution/_data_worker.py ADDED
@@ -0,0 +1,358 @@
1
+ r"""This file is based on torch/utils/data/_utils/worker.py
2
+
3
+ Contains definitions of the methods used by the _BaseDataLoaderIter workers.
4
+ These **need** to be in global scope since Py2 doesn't support serializing
5
+ static methods.
6
+ """
7
+
8
+ import os
9
+ import queue
10
+ import random
11
+ from dataclasses import dataclass
12
+ from typing import TYPE_CHECKING, Optional, Union
13
+
14
+ import torch
15
+ from torch._utils import ExceptionWrapper
16
+ from torch.utils.data._utils import HAS_NUMPY, IS_WINDOWS, MP_STATUS_CHECK_INTERVAL, signal_handling
17
+
18
+ if TYPE_CHECKING:
19
+ from torch.utils.data import Dataset
20
+
21
+ from .controller import RRSController
22
+
23
+ if IS_WINDOWS:
24
+ import ctypes
25
+ from ctypes.wintypes import BOOL, DWORD, HANDLE
26
+
27
+ # On Windows, the parent ID of the worker process remains unchanged when the manager process
28
+ # is gone, and the only way to check it through the OS is to let the worker have a process handle
29
+ # of the manager and ask if the process status has changed.
30
+ class ManagerWatchdog:
31
+ def __init__(self):
32
+ self.manager_pid = os.getppid()
33
+
34
+ # mypy cannot detect this code is windows only
35
+ self.kernel32 = ctypes.WinDLL("kernel32", use_last_error=True) # type: ignore[attr-defined]
36
+ self.kernel32.OpenProcess.argtypes = (DWORD, BOOL, DWORD)
37
+ self.kernel32.OpenProcess.restype = HANDLE
38
+ self.kernel32.WaitForSingleObject.argtypes = (HANDLE, DWORD)
39
+ self.kernel32.WaitForSingleObject.restype = DWORD
40
+
41
+ # Value obtained from https://msdn.microsoft.com/en-us/library/ms684880.aspx
42
+ SYNCHRONIZE = 0x00100000
43
+ self.manager_handle = self.kernel32.OpenProcess(SYNCHRONIZE, 0, self.manager_pid)
44
+
45
+ if not self.manager_handle:
46
+ raise ctypes.WinError(ctypes.get_last_error()) # type: ignore[attr-defined]
47
+
48
+ self.manager_dead = False
49
+
50
+ def is_alive(self):
51
+ if not self.manager_dead:
52
+ # Value obtained from https://msdn.microsoft.com/en-us/library/windows/desktop/ms687032.aspx
53
+ self.manager_dead = self.kernel32.WaitForSingleObject(self.manager_handle, 0) == 0
54
+ return not self.manager_dead
55
+
56
+ else:
57
+
58
+ class ManagerWatchdog: # type: ignore[no-redef]
59
+ def __init__(self):
60
+ self.manager_pid = os.getppid()
61
+ self.manager_dead = False
62
+
63
+ def is_alive(self):
64
+ if not self.manager_dead:
65
+ self.manager_dead = os.getppid() != self.manager_pid
66
+ return not self.manager_dead
67
+
68
+
69
+ _worker_info = None
70
+
71
+
72
+ class WorkerInfo:
73
+ id: int
74
+ num_workers: int
75
+ seed: int
76
+ dataset: "Dataset"
77
+ __initialized = False
78
+
79
+ def __init__(self, **kwargs):
80
+ for k, v in kwargs.items():
81
+ setattr(self, k, v)
82
+ self.__keys = tuple(kwargs.keys())
83
+ self.__initialized = True
84
+
85
+ def __setattr__(self, key, val):
86
+ if self.__initialized:
87
+ raise RuntimeError("Cannot assign attributes to {} objects".format(self.__class__.__name__))
88
+ return super().__setattr__(key, val)
89
+
90
+ def __repr__(self):
91
+ items = []
92
+ for k in self.__keys:
93
+ items.append("{}={}".format(k, getattr(self, k)))
94
+ return "{}({})".format(self.__class__.__name__, ", ".join(items))
95
+
96
+
97
+ def get_worker_info() -> Optional[WorkerInfo]:
98
+ r"""Returns the information about the current
99
+ :class:`~torch.utils.data.DataLoader` iterator worker process.
100
+
101
+ When called in a worker, this returns an object guaranteed to have the
102
+ following attributes:
103
+
104
+ * :attr:`id`: the current worker id.
105
+ * :attr:`num_workers`: the total number of workers.
106
+ * :attr:`seed`: the random seed set for the current worker. This value is
107
+ determined by main process RNG and the worker id. See
108
+ :class:`~torch.utils.data.DataLoader`'s documentation for more details.
109
+ * :attr:`dataset`: the copy of the dataset object in **this** process. Note
110
+ that this will be a different object in a different process than the one
111
+ in the main process.
112
+
113
+ When called in the main process, this returns ``None``.
114
+
115
+ .. note::
116
+ When used in a :attr:`worker_init_fn` passed over to
117
+ :class:`~torch.utils.data.DataLoader`, this method can be useful to
118
+ set up each worker process differently, for instance, using ``worker_id``
119
+ to configure the ``dataset`` object to only read a specific fraction of a
120
+ sharded dataset, or use ``seed`` to seed other libraries used in dataset
121
+ code.
122
+ """
123
+ return _worker_info
124
+
125
+
126
+ r"""Dummy class used to signal the end of an IterableDataset"""
127
+
128
+
129
+ @dataclass(frozen=True)
130
+ class _IterableDatasetStopIteration:
131
+ worker_id: int
132
+
133
+
134
+ r"""Dummy class used to resume the fetching when worker reuse is enabled"""
135
+
136
+
137
+ @dataclass(frozen=True)
138
+ class _ResumeIteration:
139
+ seed: Optional[int] = None
140
+
141
+
142
+ # The function `_generate_state` is adapted from `numpy.random.SeedSequence`
143
+ # from https://github.com/numpy/numpy/blob/main/numpy/random/bit_generator.pyx
144
+ # It's MIT licensed, here is the copyright:
145
+
146
+ # Copyright (c) 2015 Melissa E. O'Neill
147
+ # Copyright (c) 2019 NumPy Developers
148
+ #
149
+ # Permission is hereby granted, free of charge, to any person obtaining a copy
150
+ # of this software and associated documentation files (the "Software"), to deal
151
+ # in the Software without restriction, including without limitation the rights
152
+ # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
153
+ # copies of the Software, and to permit persons to whom the Software is
154
+ # furnished to do so, subject to the following conditions:
155
+ #
156
+ # The above copyright notice and this permission notice shall be included in
157
+ # all copies or substantial portions of the Software.
158
+ #
159
+ # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
160
+ # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
161
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
162
+ # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
163
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
164
+ # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
165
+ # SOFTWARE.
166
+
167
+
168
+ # This function generates an array of int32 as the seed for
169
+ # `numpy.random`, in order to prevent state collision due to same
170
+ # seed and algorithm for `numpy.random` and `random` modules.
171
+ # TODO: Implement `SeedSequence` like object for `torch.random`
172
+ def _generate_state(base_seed, worker_id):
173
+ INIT_A = 0x43B0D7E5
174
+ MULT_A = 0x931E8875
175
+ INIT_B = 0x8B51F9DD
176
+ MULT_B = 0x58F38DED
177
+ MIX_MULT_L = 0xCA01F9DD
178
+ MIX_MULT_R = 0x4973F715
179
+ XSHIFT = 4 * 8 // 2
180
+ MASK32 = 0xFFFFFFFF
181
+
182
+ entropy = [worker_id, base_seed & MASK32, base_seed >> 32, 0]
183
+ pool = [0] * 4
184
+
185
+ hash_const_A = INIT_A
186
+
187
+ def hash(value):
188
+ nonlocal hash_const_A
189
+ value = (value ^ hash_const_A) & MASK32
190
+ hash_const_A = (hash_const_A * MULT_A) & MASK32
191
+ value = (value * hash_const_A) & MASK32
192
+ value = (value ^ (value >> XSHIFT)) & MASK32
193
+ return value
194
+
195
+ def mix(x, y):
196
+ result_x = (MIX_MULT_L * x) & MASK32
197
+ result_y = (MIX_MULT_R * y) & MASK32
198
+ result = (result_x - result_y) & MASK32
199
+ result = (result ^ (result >> XSHIFT)) & MASK32
200
+ return result
201
+
202
+ # Add in the entropy to the pool.
203
+ for i in range(len(pool)):
204
+ pool[i] = hash(entropy[i])
205
+
206
+ # Mix all bits together so late bits can affect earlier bits.
207
+ for i_src in range(len(pool)):
208
+ for i_dst in range(len(pool)):
209
+ if i_src != i_dst:
210
+ pool[i_dst] = mix(pool[i_dst], hash(pool[i_src]))
211
+
212
+ hash_const_B = INIT_B
213
+ state = []
214
+ for i_dst in range(4):
215
+ data_val = pool[i_dst]
216
+ data_val = (data_val ^ hash_const_B) & MASK32
217
+ hash_const_B = (hash_const_B * MULT_B) & MASK32
218
+ data_val = (data_val * hash_const_B) & MASK32
219
+ data_val = (data_val ^ (data_val >> XSHIFT)) & MASK32
220
+ state.append(data_val)
221
+ return state
222
+
223
+
224
+ def _worker_loop(
225
+ dataset_kind,
226
+ dataset,
227
+ index_queue,
228
+ data_queue,
229
+ done_event,
230
+ auto_collation,
231
+ collate_fn,
232
+ drop_last,
233
+ base_seed,
234
+ init_fn,
235
+ worker_id,
236
+ num_workers,
237
+ persistent_workers,
238
+ shared_seed,
239
+ ):
240
+ # See NOTE [ Data Loader Multiprocessing Shutdown Logic ] for details on the
241
+ # logic of this function.
242
+
243
+ try:
244
+ # Initialize C side signal handlers for SIGBUS and SIGSEGV. Python signal
245
+ # module's handlers are executed after Python returns from C low-level
246
+ # handlers, likely when the same fatal signal had already happened
247
+ # again.
248
+ # https://docs.python.org/3/library/signal.html#execution-of-python-signal-handlers
249
+ signal_handling._set_worker_signal_handlers()
250
+
251
+ torch.set_num_threads(1)
252
+ seed = base_seed + worker_id
253
+ random.seed(seed)
254
+ torch.manual_seed(seed)
255
+ if HAS_NUMPY:
256
+ np_seed = _generate_state(base_seed, worker_id)
257
+ import numpy as np
258
+
259
+ np.random.seed(np_seed)
260
+
261
+ from torch.utils.data import IterDataPipe
262
+ from torch.utils.data.graph_settings import apply_random_seed
263
+
264
+ shared_rng = torch.Generator()
265
+ if isinstance(dataset, IterDataPipe):
266
+ assert shared_seed is not None
267
+ shared_rng.manual_seed(shared_seed)
268
+ dataset = apply_random_seed(dataset, shared_rng)
269
+
270
+ global _worker_info
271
+ _worker_info = WorkerInfo(id=worker_id, num_workers=num_workers, seed=seed, dataset=dataset)
272
+
273
+ from torch.utils.data import _DatasetKind
274
+
275
+ init_exception = None
276
+
277
+ try:
278
+ if init_fn is not None:
279
+ init_fn(worker_id)
280
+
281
+ fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, auto_collation, collate_fn, drop_last)
282
+ except Exception:
283
+ init_exception = ExceptionWrapper(where="in DataLoader worker process {}".format(worker_id))
284
+
285
+ # When using Iterable mode, some workers can exit earlier than others due
286
+ # to the IterableDataset behaving differently for different workers.
287
+ # When such things happen, an `_IterableDatasetStopIteration` object is
288
+ # sent over to the main process with the ID of this worker, so that the
289
+ # main process won't send more tasks to this worker, and will send
290
+ # `None` to this worker to properly exit it.
291
+ #
292
+ # Note that we cannot set `done_event` from a worker as it is shared
293
+ # among all processes. Instead, we set the `iteration_end` flag to
294
+ # signify that the iterator is exhausted. When either `done_event` or
295
+ # `iteration_end` is set, we skip all processing steps and just wait for
296
+ # `None`.
297
+ iteration_end = False
298
+
299
+ watchdog = ManagerWatchdog()
300
+
301
+ while watchdog.is_alive():
302
+ try:
303
+ r = index_queue.get(timeout=MP_STATUS_CHECK_INTERVAL)
304
+ except queue.Empty:
305
+ continue
306
+ if isinstance(r, _ResumeIteration):
307
+ # Acknowledge the main process
308
+ data_queue.put((r, None))
309
+ iteration_end = False
310
+
311
+ if isinstance(dataset, IterDataPipe):
312
+ assert r.seed is not None
313
+ shared_rng.manual_seed(r.seed)
314
+ dataset = apply_random_seed(dataset, shared_rng)
315
+
316
+ # Recreate the fetcher for worker-reuse policy
317
+ fetcher = _DatasetKind.create_fetcher(dataset_kind, dataset, auto_collation, collate_fn, drop_last)
318
+ continue
319
+ elif r is None:
320
+ # Received the final signal
321
+ assert done_event.is_set() or iteration_end
322
+ break
323
+ elif done_event.is_set() or iteration_end:
324
+ # `done_event` is set. But I haven't received the final signal
325
+ # (None) yet. I will keep continuing until I get it, and skip the
326
+ # processing steps.
327
+ continue
328
+ idx, index = r
329
+ """ Added """
330
+ RRSController.sample_resolution(batch_id=idx)
331
+ """ Added """
332
+ data: Union[_IterableDatasetStopIteration, ExceptionWrapper]
333
+ if init_exception is not None:
334
+ data = init_exception
335
+ init_exception = None
336
+ else:
337
+ try:
338
+ data = fetcher.fetch(index)
339
+ except Exception as e:
340
+ if isinstance(e, StopIteration) and dataset_kind == _DatasetKind.Iterable:
341
+ data = _IterableDatasetStopIteration(worker_id)
342
+ # Set `iteration_end`
343
+ # (1) to save future `next(...)` calls, and
344
+ # (2) to avoid sending multiple `_IterableDatasetStopIteration`s.
345
+ iteration_end = True
346
+ else:
347
+ # It is important that we don't store exc_info in a variable.
348
+ # `ExceptionWrapper` does the correct thing.
349
+ # See NOTE [ Python Traceback Reference Cycle Problem ]
350
+ data = ExceptionWrapper(where="in DataLoader worker process {}".format(worker_id))
351
+ data_queue.put((idx, data))
352
+ del data, idx, index, r # save memory
353
+ except KeyboardInterrupt:
354
+ # Main process will raise KeyboardInterrupt anyway.
355
+ pass
356
+ if done_event.is_set():
357
+ data_queue.cancel_join_thread()
358
+ data_queue.close()
efficientvit/apps/data_provider/random_resolution/controller.py ADDED
@@ -0,0 +1,92 @@
1
+ # EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense Prediction
2
+ # Han Cai, Junyan Li, Muyan Hu, Chuang Gan, Song Han
3
+ # International Conference on Computer Vision (ICCV), 2023
4
+
5
+ import copy
6
+
7
+ import torch
8
+ import torchvision.transforms as transforms
9
+ import torchvision.transforms.functional as F
10
+
11
+ from efficientvit.models.utils import torch_random_choices
12
+
13
+ __all__ = [
14
+ "RRSController",
15
+ "get_interpolate",
16
+ "MyRandomResizedCrop",
17
+ ]
18
+
19
+
20
+ class RRSController:
21
+ ACTIVE_SIZE = (224, 224)
22
+ IMAGE_SIZE_LIST = [(224, 224)]
23
+
24
+ CHOICE_LIST = None
25
+
26
+ @staticmethod
27
+ def get_candidates() -> list[tuple[int, int]]:
28
+ return copy.deepcopy(RRSController.IMAGE_SIZE_LIST)
29
+
30
+ @staticmethod
31
+ def sample_resolution(batch_id: int) -> None:
32
+ RRSController.ACTIVE_SIZE = RRSController.CHOICE_LIST[batch_id]
33
+
34
+ @staticmethod
35
+ def set_epoch(epoch: int, batch_per_epoch: int) -> None:
36
+ g = torch.Generator()
37
+ g.manual_seed(epoch)
38
+ RRSController.CHOICE_LIST = torch_random_choices(
39
+ RRSController.get_candidates(),
40
+ g,
41
+ batch_per_epoch,
42
+ )
43
+
44
+
45
+ def get_interpolate(name: str) -> F.InterpolationMode:
46
+ mapping = {
47
+ "nearest": F.InterpolationMode.NEAREST,
48
+ "bilinear": F.InterpolationMode.BILINEAR,
49
+ "bicubic": F.InterpolationMode.BICUBIC,
50
+ "box": F.InterpolationMode.BOX,
51
+ "hamming": F.InterpolationMode.HAMMING,
52
+ "lanczos": F.InterpolationMode.LANCZOS,
53
+ }
54
+ if name in mapping:
55
+ return mapping[name]
56
+ elif name == "random":
57
+ return torch_random_choices(
58
+ [
59
+ F.InterpolationMode.NEAREST,
60
+ F.InterpolationMode.BILINEAR,
61
+ F.InterpolationMode.BICUBIC,
62
+ F.InterpolationMode.BOX,
63
+ F.InterpolationMode.HAMMING,
64
+ F.InterpolationMode.LANCZOS,
65
+ ],
66
+ )
67
+ else:
68
+ raise NotImplementedError
69
+
70
+
71
+ class MyRandomResizedCrop(transforms.RandomResizedCrop):
72
+ def __init__(
73
+ self,
74
+ scale=(0.08, 1.0),
75
+ ratio=(3.0 / 4.0, 4.0 / 3.0),
76
+ interpolation: str = "random",
77
+ ):
78
+ super(MyRandomResizedCrop, self).__init__(224, scale, ratio)
79
+ self.interpolation = interpolation
80
+
81
+ def forward(self, img: torch.Tensor) -> torch.Tensor:
82
+ i, j, h, w = self.get_params(img, list(self.scale), list(self.ratio))
83
+ target_size = RRSController.ACTIVE_SIZE
84
+ return F.resized_crop(img, i, j, h, w, list(target_size), get_interpolate(self.interpolation))
85
+
86
+ def __repr__(self) -> str:
87
+ format_string = self.__class__.__name__
88
+ format_string += f"(\n\tsize={RRSController.get_candidates()},\n"
89
+ format_string += f"\tscale={tuple(round(s, 4) for s in self.scale)},\n"
90
+ format_string += f"\tratio={tuple(round(r, 4) for r in self.ratio)},\n"
91
+ format_string += f"\tinterpolation={self.interpolation})"
92
+ return format_string
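
A hypothetical end-to-end sketch of the controller above: `set_epoch` pre-samples one resolution per batch from `IMAGE_SIZE_LIST`, the worker loop in `_data_worker.py` calls `sample_resolution(batch_id)` before fetching, and `MyRandomResizedCrop` then crops to the currently active size. The candidate sizes and the manual loop are illustrative, not the repo's training configuration:

```python
import torch

from efficientvit.apps.data_provider.random_resolution.controller import (
    MyRandomResizedCrop,
    RRSController,
)

# Illustrative candidate resolutions; the data provider normally sets these.
RRSController.IMAGE_SIZE_LIST = [(160, 160), (192, 192), (224, 224), (256, 256)]
RRSController.set_epoch(epoch=0, batch_per_epoch=100)  # deterministic per-epoch choices

crop = MyRandomResizedCrop(interpolation="bicubic")
img = torch.rand(3, 320, 320)

for batch_id in range(3):
    RRSController.sample_resolution(batch_id)  # normally done inside _worker_loop
    out = crop(img)
    print(batch_id, RRSController.ACTIVE_SIZE, tuple(out.shape[-2:]))
```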
efficientvit/apps/setup.py ADDED
@@ -0,0 +1,135 @@
1
+ # EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense Prediction
2
+ # Han Cai, Junyan Li, Muyan Hu, Chuang Gan, Song Han
3
+ # International Conference on Computer Vision (ICCV), 2023
4
+
5
+ import os
6
+ import time
7
+ from copy import deepcopy
8
+
9
+ import torch.backends.cudnn
10
+ import torch.distributed
11
+ import torch.nn as nn
12
+
13
+ from efficientvit.apps.data_provider import DataProvider
14
+ from efficientvit.apps.trainer.run_config import RunConfig
15
+ from efficientvit.apps.utils import (
16
+ dist_init,
17
+ dump_config,
18
+ get_dist_local_rank,
19
+ get_dist_rank,
20
+ get_dist_size,
21
+ init_modules,
22
+ is_master,
23
+ load_config,
24
+ partial_update_config,
25
+ zero_last_gamma,
26
+ )
27
+ from efficientvit.models.utils import build_kwargs_from_config, load_state_dict_from_file
28
+
29
+ __all__ = [
30
+ "save_exp_config",
31
+ "setup_dist_env",
32
+ "setup_seed",
33
+ "setup_exp_config",
34
+ "setup_data_provider",
35
+ "setup_run_config",
36
+ "init_model",
37
+ ]
38
+
39
+
40
+ def save_exp_config(exp_config: dict, path: str, name="config.yaml") -> None:
41
+ if not is_master():
42
+ return
43
+ dump_config(exp_config, os.path.join(path, name))
44
+
45
+
46
+ def setup_dist_env(gpu: str or None = None) -> None:
47
+ if gpu is not None:
48
+ os.environ["CUDA_VISIBLE_DEVICES"] = gpu
49
+ if not torch.distributed.is_initialized():
50
+ dist_init()
51
+ torch.backends.cudnn.benchmark = True
52
+ torch.cuda.set_device(get_dist_local_rank())
53
+
54
+
55
+ def setup_seed(manual_seed: int, resume: bool) -> None:
56
+ if resume:
57
+ manual_seed = int(time.time())
58
+ manual_seed = get_dist_rank() + manual_seed
59
+ torch.manual_seed(manual_seed)
60
+ torch.cuda.manual_seed_all(manual_seed)
61
+
62
+
63
+ def setup_exp_config(config_path: str, recursive=True, opt_args: dict or None = None) -> dict:
64
+ # load config
65
+ if not os.path.isfile(config_path):
66
+ raise ValueError(config_path)
67
+
68
+ fpaths = [config_path]
69
+ if recursive:
70
+ extension = os.path.splitext(config_path)[1]
71
+ while os.path.dirname(config_path) != config_path:
72
+ config_path = os.path.dirname(config_path)
73
+ fpath = os.path.join(config_path, "default" + extension)
74
+ if os.path.isfile(fpath):
75
+ fpaths.append(fpath)
76
+ fpaths = fpaths[::-1]
77
+
78
+ default_config = load_config(fpaths[0])
79
+ exp_config = deepcopy(default_config)
80
+ for fpath in fpaths[1:]:
81
+ partial_update_config(exp_config, load_config(fpath))
82
+ # update config via args
83
+ if opt_args is not None:
84
+ partial_update_config(exp_config, opt_args)
85
+
86
+ return exp_config
87
+
88
+
89
+ def setup_data_provider(
90
+ exp_config: dict, data_provider_classes: list[type[DataProvider]], is_distributed: bool = True
91
+ ) -> DataProvider:
92
+ dp_config = exp_config["data_provider"]
93
+ dp_config["num_replicas"] = get_dist_size() if is_distributed else None
94
+ dp_config["rank"] = get_dist_rank() if is_distributed else None
95
+ dp_config["test_batch_size"] = dp_config.get("test_batch_size", None) or dp_config["base_batch_size"] * 2
96
+ dp_config["batch_size"] = dp_config["train_batch_size"] = dp_config["base_batch_size"]
97
+
98
+ data_provider_lookup = {provider.name: provider for provider in data_provider_classes}
99
+ data_provider_class = data_provider_lookup[dp_config["dataset"]]
100
+
101
+ data_provider_kwargs = build_kwargs_from_config(dp_config, data_provider_class)
102
+ data_provider = data_provider_class(**data_provider_kwargs)
103
+ return data_provider
104
+
105
+
106
+ def setup_run_config(exp_config: dict, run_config_cls: type[RunConfig]) -> RunConfig:
107
+ exp_config["run_config"]["init_lr"] = exp_config["run_config"]["base_lr"] * get_dist_size()
108
+
109
+ run_config = run_config_cls(**exp_config["run_config"])
110
+
111
+ return run_config
112
+
113
+
114
+ def init_model(
115
+ network: nn.Module,
116
+ init_from: str or None = None,
117
+ backbone_init_from: str or None = None,
118
+ rand_init="trunc_normal",
119
+ last_gamma=None,
120
+ ) -> None:
121
+ # initialization
122
+ init_modules(network, init_type=rand_init)
123
+ # zero gamma of last bn in each block
124
+ if last_gamma is not None:
125
+ zero_last_gamma(network, last_gamma)
126
+
127
+ # load weight
128
+ if init_from is not None and os.path.isfile(init_from):
129
+ network.load_state_dict(load_state_dict_from_file(init_from))
130
+ print(f"Loaded init from {init_from}")
131
+ elif backbone_init_from is not None and os.path.isfile(backbone_init_from):
132
+ network.backbone.load_state_dict(load_state_dict_from_file(backbone_init_from))
133
+ print(f"Loaded backbone init from {backbone_init_from}")
134
+ else:
135
+ print(f"Random init ({rand_init}) with last gamma {last_gamma}")
efficientvit/apps/trainer/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense Prediction
2
+ # Han Cai, Junyan Li, Muyan Hu, Chuang Gan, Song Han
3
+ # International Conference on Computer Vision (ICCV), 2023
4
+
5
+ from .base import *
6
+ from .run_config import *
efficientvit/apps/trainer/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (226 Bytes).
 
efficientvit/apps/trainer/__pycache__/base.cpython-310.pyc ADDED
Binary file (8.47 kB).
 
efficientvit/apps/trainer/__pycache__/run_config.cpython-310.pyc ADDED
Binary file (4.04 kB).
 
efficientvit/apps/trainer/base.py ADDED
@@ -0,0 +1,299 @@
1
+ # EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense Prediction
2
+ # Han Cai, Junyan Li, Muyan Hu, Chuang Gan, Song Han
3
+ # International Conference on Computer Vision (ICCV), 2023
4
+
5
+ import os
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+
10
+ from efficientvit.apps.data_provider import DataProvider, parse_image_size
11
+ from efficientvit.apps.trainer.run_config import RunConfig
12
+ from efficientvit.apps.utils import EMA, dist_barrier, get_dist_local_rank, is_master
13
+ from efficientvit.models.nn.norm import reset_bn
14
+ from efficientvit.models.utils import is_parallel, load_state_dict_from_file
15
+
16
+ __all__ = ["Trainer"]
17
+
18
+
19
+ class Trainer:
20
+ def __init__(self, path: str, model: nn.Module, data_provider: DataProvider):
21
+ self.path = os.path.realpath(os.path.expanduser(path))
22
+ self.model = model.cuda()
23
+ self.data_provider = data_provider
24
+
25
+ self.ema = None
26
+
27
+ self.checkpoint_path = os.path.join(self.path, "checkpoint")
28
+ self.logs_path = os.path.join(self.path, "logs")
29
+ for path in [self.path, self.checkpoint_path, self.logs_path]:
30
+ os.makedirs(path, exist_ok=True)
31
+
32
+ self.best_val = 0.0
33
+ self.start_epoch = 0
34
+
35
+ @property
36
+ def network(self) -> nn.Module:
37
+ return self.model.module if is_parallel(self.model) else self.model
38
+
39
+ @property
40
+ def eval_network(self) -> nn.Module:
41
+ if self.ema is None:
42
+ model = self.model
43
+ else:
44
+ model = self.ema.shadows
45
+ model = model.module if is_parallel(model) else model
46
+ return model
47
+
48
+ def write_log(self, log_str, prefix="valid", print_log=True, mode="a") -> None:
49
+ if is_master():
50
+ fout = open(os.path.join(self.logs_path, f"{prefix}.log"), mode)
51
+ fout.write(log_str + "\n")
52
+ fout.flush()
53
+ fout.close()
54
+ if print_log:
55
+ print(log_str)
56
+
57
+ def save_model(
58
+ self,
59
+ checkpoint=None,
60
+ only_state_dict=True,
61
+ epoch=0,
62
+ model_name=None,
63
+ ) -> None:
64
+ if is_master():
65
+ if checkpoint is None:
66
+ if only_state_dict:
67
+ checkpoint = {"state_dict": self.network.state_dict()}
68
+ else:
69
+ checkpoint = {
70
+ "state_dict": self.network.state_dict(),
71
+ "epoch": epoch,
72
+ "best_val": self.best_val,
73
+ "optimizer": self.optimizer.state_dict(),
74
+ "lr_scheduler": self.lr_scheduler.state_dict(),
75
+ "ema": self.ema.state_dict() if self.ema is not None else None,
76
+ "scaler": self.scaler.state_dict() if self.enable_amp else None,
77
+ }
78
+
79
+ model_name = model_name or "checkpoint.pt"
80
+
81
+ latest_fname = os.path.join(self.checkpoint_path, "latest.txt")
82
+ model_path = os.path.join(self.checkpoint_path, model_name)
83
+ with open(latest_fname, "w") as _fout:
84
+ _fout.write(model_path + "\n")
85
+ torch.save(checkpoint, model_path)
86
+
87
+ def load_model(self, model_fname=None) -> None:
88
+ latest_fname = os.path.join(self.checkpoint_path, "latest.txt")
89
+ if model_fname is None and os.path.exists(latest_fname):
90
+ with open(latest_fname, "r") as fin:
91
+ model_fname = fin.readline()
92
+ if len(model_fname) > 0 and model_fname[-1] == "\n":
93
+ model_fname = model_fname[:-1]
94
+ try:
95
+ if model_fname is None:
96
+ model_fname = f"{self.checkpoint_path}/checkpoint.pt"
97
+ elif not os.path.exists(model_fname):
98
+ model_fname = f"{self.checkpoint_path}/{os.path.basename(model_fname)}"
99
+ if not os.path.exists(model_fname):
100
+ model_fname = f"{self.checkpoint_path}/checkpoint.pt"
101
+ print(f"=> loading checkpoint {model_fname}")
102
+ checkpoint = load_state_dict_from_file(model_fname, False)
103
+ except Exception:
104
+ self.write_log(f"fail to load checkpoint from {self.checkpoint_path}")
105
+ return
106
+
107
+ # load checkpoint
108
+ self.network.load_state_dict(checkpoint["state_dict"], strict=False)
109
+ log = []
110
+ if "epoch" in checkpoint:
111
+ self.start_epoch = checkpoint["epoch"] + 1
112
+ self.run_config.update_global_step(self.start_epoch)
113
+ log.append(f"epoch={self.start_epoch - 1}")
114
+ if "best_val" in checkpoint:
115
+ self.best_val = checkpoint["best_val"]
116
+ log.append(f"best_val={self.best_val:.2f}")
117
+ if "optimizer" in checkpoint:
118
+ self.optimizer.load_state_dict(checkpoint["optimizer"])
119
+ log.append("optimizer")
120
+ if "lr_scheduler" in checkpoint:
121
+ self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
122
+ log.append("lr_scheduler")
123
+ if "ema" in checkpoint and self.ema is not None:
124
+ self.ema.load_state_dict(checkpoint["ema"])
125
+ log.append("ema")
126
+ if "scaler" in checkpoint and self.enable_amp:
127
+ self.scaler.load_state_dict(checkpoint["scaler"])
128
+ log.append("scaler")
129
+ self.write_log("Loaded: " + ", ".join(log))
130
+
131
+ """ validate """
132
+
133
+ def reset_bn(
134
+ self,
135
+ network: nn.Module or None = None,
136
+ subset_size: int = 16000,
137
+ subset_batch_size: int = 100,
138
+ data_loader=None,
139
+ progress_bar=False,
140
+ ) -> None:
141
+ network = network or self.network
142
+ if data_loader is None:
143
+ data_loader = []
144
+ for data in self.data_provider.build_sub_train_loader(subset_size, subset_batch_size):
145
+ if isinstance(data, list):
146
+ data_loader.append(data[0])
147
+ elif isinstance(data, dict):
148
+ data_loader.append(data["data"])
149
+ elif isinstance(data, torch.Tensor):
150
+ data_loader.append(data)
151
+ else:
152
+ raise NotImplementedError
153
+
154
+ network.eval()
155
+ reset_bn(
156
+ network,
157
+ data_loader,
158
+ sync=True,
159
+ progress_bar=progress_bar,
160
+ )
161
+
162
+ def _validate(self, model, data_loader, epoch) -> dict[str, any]:
163
+ raise NotImplementedError
164
+
165
+ def validate(self, model=None, data_loader=None, is_test=True, epoch=0) -> dict[str, any]:
166
+ model = model or self.eval_network
167
+ if data_loader is None:
168
+ if is_test:
169
+ data_loader = self.data_provider.test
170
+ else:
171
+ data_loader = self.data_provider.valid
172
+
173
+ model.eval()
174
+ return self._validate(model, data_loader, epoch)
175
+
176
+ def multires_validate(
177
+ self,
178
+ model=None,
179
+ data_loader=None,
180
+ is_test=True,
181
+ epoch=0,
182
+ eval_image_size=None,
183
+ ) -> dict[str, dict[str, any]]:
184
+ eval_image_size = eval_image_size or self.run_config.eval_image_size
185
+ eval_image_size = eval_image_size or self.data_provider.image_size
186
+ model = model or self.eval_network
187
+
188
+ if not isinstance(eval_image_size, list):
189
+ eval_image_size = [eval_image_size]
190
+
191
+ output_dict = {}
192
+ for r in eval_image_size:
193
+ self.data_provider.assign_active_image_size(parse_image_size(r))
194
+ if self.run_config.reset_bn:
195
+ self.reset_bn(
196
+ network=model,
197
+ subset_size=self.run_config.reset_bn_size,
198
+ subset_batch_size=self.run_config.reset_bn_batch_size,
199
+ progress_bar=True,
200
+ )
201
+ output_dict[f"r{r}"] = self.validate(model, data_loader, is_test, epoch)
202
+ return output_dict
203
+
204
+ """ training """
205
+
206
+ def prep_for_training(self, run_config: RunConfig, ema_decay: float or None = None, amp="fp32") -> None:
207
+ self.run_config = run_config
208
+ self.model = nn.parallel.DistributedDataParallel(
209
+ self.model.cuda(),
210
+ device_ids=[get_dist_local_rank()],
211
+ static_graph=True,
212
+ )
213
+
214
+ self.run_config.global_step = 0
215
+ self.run_config.batch_per_epoch = len(self.data_provider.train)
216
+ assert self.run_config.batch_per_epoch > 0, "Training set is empty"
217
+
218
+ # build optimizer
219
+ self.optimizer, self.lr_scheduler = self.run_config.build_optimizer(self.model)
220
+
221
+ if ema_decay is not None:
222
+ self.ema = EMA(self.network, ema_decay)
223
+
224
+ # amp
225
+ self.amp = amp
226
+ self.scaler = torch.cuda.amp.GradScaler(enabled=self.enable_amp)
227
+
228
+ @property
229
+ def enable_amp(self) -> bool:
230
+ return self.amp != "fp32"
231
+
232
+ @property
233
+ def amp_dtype(self) -> torch.dtype:
234
+ if self.amp == "fp16":
235
+ return torch.float16
236
+ elif self.amp == "bf16":
237
+ return torch.bfloat16
238
+ else:
239
+ return torch.float32
240
+
241
+ def sync_model(self):
242
+ print("Sync model")
243
+ self.save_model(model_name="sync.pt")
244
+ dist_barrier()
245
+ checkpoint = torch.load(os.path.join(self.checkpoint_path, "sync.pt"), map_location="cpu")
246
+ dist_barrier()
247
+ if is_master():
248
+ os.remove(os.path.join(self.checkpoint_path, "sync.pt"))
249
+ dist_barrier()
250
+
251
+ # load checkpoint
252
+ self.network.load_state_dict(checkpoint["state_dict"], strict=False)
253
+ if "optimizer" in checkpoint:
254
+ self.optimizer.load_state_dict(checkpoint["optimizer"])
255
+ if "lr_scheduler" in checkpoint:
256
+ self.lr_scheduler.load_state_dict(checkpoint["lr_scheduler"])
257
+ if "ema" in checkpoint and self.ema is not None:
258
+ self.ema.load_state_dict(checkpoint["ema"])
259
+ if "scaler" in checkpoint and self.enable_amp:
260
+ self.scaler.load_state_dict(checkpoint["scaler"])
261
+
262
+ def before_step(self, feed_dict: dict[str, any]) -> dict[str, any]:
263
+ for key in feed_dict:
264
+ if isinstance(feed_dict[key], torch.Tensor):
265
+ feed_dict[key] = feed_dict[key].cuda()
266
+ return feed_dict
267
+
268
+ def run_step(self, feed_dict: dict[str, any]) -> dict[str, any]:
269
+ raise NotImplementedError
270
+
271
+ def after_step(self) -> None:
272
+ self.scaler.unscale_(self.optimizer)
273
+ # gradient clip
274
+ if self.run_config.grad_clip is not None:
275
+ torch.nn.utils.clip_grad_value_(self.model.parameters(), self.run_config.grad_clip)
276
+ # update
277
+ self.scaler.step(self.optimizer)
278
+ self.scaler.update()
279
+
280
+ self.lr_scheduler.step()
281
+ self.run_config.step()
282
+ # update ema
283
+ if self.ema is not None:
284
+ self.ema.step(self.network, self.run_config.global_step)
285
+
286
+ def _train_one_epoch(self, epoch: int) -> dict[str, any]:
287
+ raise NotImplementedError
288
+
289
+ def train_one_epoch(self, epoch: int) -> dict[str, any]:
290
+ self.model.train()
291
+
292
+ self.data_provider.set_epoch(epoch)
293
+
294
+ train_info_dict = self._train_one_epoch(epoch)
295
+
296
+ return train_info_dict
297
+
298
+ def train(self) -> None:
299
+ raise NotImplementedError
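
`Trainer` above leaves the task-specific pieces abstract. A hypothetical classification-style subclass might fill in the hooks roughly as below; the `"data"`/`"label"` feed-dict keys, the loss, and the metric are illustrative assumptions rather than the repo's actual trainers:

```python
import torch
import torch.nn.functional as F

from efficientvit.apps.trainer.base import Trainer


class ClsTrainer(Trainer):
    def run_step(self, feed_dict):
        # feed-dict keys "data"/"label" are assumed here for illustration
        images, labels = feed_dict["data"], feed_dict["label"]
        with torch.autocast("cuda", dtype=self.amp_dtype, enabled=self.enable_amp):
            output = self.model(images)
            loss = F.cross_entropy(output, labels)
        self.scaler.scale(loss).backward()  # after_step() unscales, clips, and steps
        return {"loss": loss}

    def _train_one_epoch(self, epoch):
        loss = torch.zeros(())
        for feed_dict in self.data_provider.train:
            feed_dict = self.before_step(feed_dict)  # moves tensors to GPU
            self.optimizer.zero_grad()
            loss = self.run_step(feed_dict)["loss"]
            self.after_step()                        # optimizer / scheduler / EMA step
        return {"loss": loss.item()}

    def _validate(self, model, data_loader, epoch):
        correct, total = 0, 0
        with torch.no_grad():
            for feed_dict in data_loader:
                feed_dict = self.before_step(feed_dict)
                pred = model(feed_dict["data"]).argmax(dim=1)
                correct += (pred == feed_dict["label"]).sum().item()
                total += feed_dict["label"].numel()
        return {"top1": 100.0 * correct / max(total, 1)}

    def train(self):
        for epoch in range(self.start_epoch, self.run_config.n_epochs):
            train_info = self.train_one_epoch(epoch)
            val_info = self.validate(epoch=epoch)
            self.write_log(f"epoch {epoch}: {train_info} {val_info}")
            self.save_model(epoch=epoch, only_state_dict=False)
```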
efficientvit/apps/trainer/run_config.py ADDED
@@ -0,0 +1,115 @@
1
+ # EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense Prediction
2
+ # Han Cai, Junyan Li, Muyan Hu, Chuang Gan, Song Han
3
+ # International Conference on Computer Vision (ICCV), 2023
4
+
5
+ import json
6
+
7
+ import numpy as np
8
+ import torch.nn as nn
9
+
10
+ from efficientvit.apps.utils import CosineLRwithWarmup, build_optimizer
11
+
12
+ __all__ = ["Scheduler", "RunConfig"]
13
+
14
+
15
+ class Scheduler:
16
+ PROGRESS = 0
17
+
18
+
19
+ class RunConfig:
20
+ n_epochs: int
21
+ init_lr: float
22
+ warmup_epochs: int
23
+ warmup_lr: float
24
+ lr_schedule_name: str
25
+ lr_schedule_param: dict
26
+ optimizer_name: str
27
+ optimizer_params: dict
28
+ weight_decay: float
29
+ no_wd_keys: list
30
+ grad_clip: float # allow none to turn off grad clipping
31
+ reset_bn: bool
32
+ reset_bn_size: int
33
+ reset_bn_batch_size: int
34
+ eval_image_size: list # allow none to use image_size in data_provider
35
+
36
+ @property
37
+ def none_allowed(self):
38
+ return ["grad_clip", "eval_image_size"]
39
+
40
+ def __init__(self, **kwargs): # arguments must be passed as kwargs
41
+ for k, val in kwargs.items():
42
+ setattr(self, k, val)
43
+
44
+ # check that all relevant configs are there
45
+ annotations = {}
46
+ for clas in type(self).mro():
47
+ if hasattr(clas, "__annotations__"):
48
+ annotations.update(clas.__annotations__)
49
+ for k, k_type in annotations.items():
50
+ assert hasattr(self, k), f"Key {k} with type {k_type} required for initialization."
51
+ attr = getattr(self, k)
52
+ if k in self.none_allowed:
53
+ k_type = (k_type, type(None))
54
+ assert isinstance(attr, k_type), f"Key {k} must be type {k_type}, provided={attr}."
55
+
56
+ self.global_step = 0
57
+ self.batch_per_epoch = 1
58
+
59
+ def build_optimizer(self, network: nn.Module) -> tuple[any, any]:
60
+ r"""require setting 'batch_per_epoch' before building optimizer & lr_scheduler"""
61
+ param_dict = {}
62
+ for name, param in network.named_parameters():
63
+ if param.requires_grad:
64
+ opt_config = [self.weight_decay, self.init_lr]
65
+ if self.no_wd_keys is not None and len(self.no_wd_keys) > 0:
66
+ if np.any([key in name for key in self.no_wd_keys]):
67
+ opt_config[0] = 0
68
+ opt_key = json.dumps(opt_config)
69
+ param_dict[opt_key] = param_dict.get(opt_key, []) + [param]
70
+
71
+ net_params = []
72
+ for opt_key, param_list in param_dict.items():
73
+ wd, lr = json.loads(opt_key)
74
+ net_params.append({"params": param_list, "weight_decay": wd, "lr": lr})
75
+
76
+ optimizer = build_optimizer(net_params, self.optimizer_name, self.optimizer_params, self.init_lr)
77
+ # build lr scheduler
78
+ if self.lr_schedule_name == "cosine":
79
+ decay_steps = []
80
+ for epoch in self.lr_schedule_param.get("step", []):
81
+ decay_steps.append(epoch * self.batch_per_epoch)
82
+ decay_steps.append(self.n_epochs * self.batch_per_epoch)
83
+ decay_steps.sort()
84
+ lr_scheduler = CosineLRwithWarmup(
85
+ optimizer,
86
+ self.warmup_epochs * self.batch_per_epoch,
87
+ self.warmup_lr,
88
+ decay_steps,
89
+ )
90
+ else:
91
+ raise NotImplementedError
92
+ return optimizer, lr_scheduler
93
+
94
+ def update_global_step(self, epoch, batch_id=0) -> None:
95
+ self.global_step = epoch * self.batch_per_epoch + batch_id
96
+ Scheduler.PROGRESS = self.progress
97
+
98
+ @property
99
+ def progress(self) -> float:
100
+ warmup_steps = self.warmup_epochs * self.batch_per_epoch
101
+ steps = max(0, self.global_step - warmup_steps)
102
+ return steps / (self.n_epochs * self.batch_per_epoch)
103
+
104
+ def step(self) -> None:
105
+ self.global_step += 1
106
+ Scheduler.PROGRESS = self.progress
107
+
108
+ def get_remaining_epoch(self, epoch, post=True) -> int:
109
+ return self.n_epochs + self.warmup_epochs - epoch - int(post)
110
+
111
+ def epoch_format(self, epoch: int) -> str:
112
+ epoch_format = f"%.{len(str(self.n_epochs))}d"
113
+ epoch_format = f"[{epoch_format}/{epoch_format}]"
114
+ epoch_format = epoch_format % (epoch + 1 - self.warmup_epochs, self.n_epochs)
115
+ return epoch_format
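
As a small sanity check of the annotation-driven validation in `RunConfig.__init__` above: every annotated field must be supplied as a keyword argument, with `grad_clip` and `eval_image_size` allowed to be `None`. The values below are illustrative, not a recommended recipe:

```python
from efficientvit.apps.trainer.run_config import RunConfig

run_config = RunConfig(
    n_epochs=300,
    init_lr=1e-3,
    warmup_epochs=5,
    warmup_lr=1e-5,
    lr_schedule_name="cosine",
    lr_schedule_param={},
    optimizer_name="adamw",
    optimizer_params={"betas": (0.9, 0.999)},
    weight_decay=0.05,
    no_wd_keys=["bias", "norm"],
    grad_clip=None,          # None disables gradient clipping in Trainer.after_step
    reset_bn=False,
    reset_bn_size=16000,
    reset_bn_batch_size=100,
    eval_image_size=None,    # None falls back to the data provider's image_size
)
run_config.batch_per_epoch = 1000   # must be set before build_optimizer()
print(run_config.progress)          # 0.0 at global_step == 0
```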
efficientvit/apps/utils/__init__.py ADDED
@@ -0,0 +1,12 @@
1
+ # EfficientViT: Multi-Scale Linear Attention for High-Resolution Dense Prediction
2
+ # Han Cai, Junyan Li, Muyan Hu, Chuang Gan, Song Han
3
+ # International Conference on Computer Vision (ICCV), 2023
4
+
5
+ from .dist import *
6
+ from .ema import *
7
+ from .export import *
8
+ from .init import *
9
+ from .lr import *
10
+ from .metric import *
11
+ from .misc import *
12
+ from .opt import *
efficientvit/apps/utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (314 Bytes).
 
efficientvit/apps/utils/__pycache__/dist.cpython-310.pyc ADDED
Binary file (2.13 kB).
 
efficientvit/apps/utils/__pycache__/ema.cpython-310.pyc ADDED
Binary file (1.91 kB).
 
efficientvit/apps/utils/__pycache__/export.cpython-310.pyc ADDED
Binary file (1.35 kB).
 
efficientvit/apps/utils/__pycache__/init.cpython-310.pyc ADDED
Binary file (2.01 kB).
 
efficientvit/apps/utils/__pycache__/lr.cpython-310.pyc ADDED
Binary file (1.74 kB).
 
efficientvit/apps/utils/__pycache__/metric.cpython-310.pyc ADDED
Binary file (1.61 kB).