gheinrich committed on
Commit
be257a4
1 Parent(s): 42f66ec

Upload model

Files changed (8)
  1. adaptor_generic.py +29 -0
  2. adaptor_mlp.py +150 -0
  3. adaptor_registry.py +37 -0
  4. eradio_model.py +18 -431
  5. hf_model.py +13 -42
  6. open_clip_adaptor.py +41 -0
  7. radio_model.py +1 -7
  8. vitdet.py +173 -0
adaptor_generic.py ADDED
@@ -0,0 +1,29 @@
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ #
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+ from argparse import Namespace
+
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+
+ from .adaptor_base import AdaptorBase, AdaptorInput, RadioOutput
+ from .adaptor_mlp import create_mlp_from_state
+
+
+ class GenericAdaptor(AdaptorBase):
+     def __init__(self, main_config: Namespace, adaptor_config, state):
+         super().__init__()
+
+         self.head_mlp = create_mlp_from_state(main_config.mlp_version, state, 'summary.')
+         self.feat_mlp = create_mlp_from_state(main_config.mlp_version, state, 'feature.')
+
+     def forward(self, input: AdaptorInput) -> RadioOutput:
+         summary = self.head_mlp(input.summary)
+         feat = self.feat_mlp(input.features)
+
+         return RadioOutput(summary, feat)
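
For reference, a minimal usage sketch of the adaptor above, assuming a checkpoint whose adaptor weights are stored flat with 'summary.' and 'feature.' key prefixes; the file path and mlp_version value are illustrative, not part of this commit:

    from argparse import Namespace
    import torch

    # Hypothetical checkpoint: keys look like 'summary.fc1.weight', 'feature.final.2.weight', ...
    adaptor_state = torch.load('adaptor_checkpoint.pth', map_location='cpu')

    # mlp_version must match the MLP_FACTORY variant the weights were trained with.
    main_config = Namespace(mlp_version='v2')
    adaptor = GenericAdaptor(main_config, adaptor_config={}, state=adaptor_state)

    # Calling adaptor(AdaptorInput(...)) then maps the backbone summary and patch
    # features through the two MLP heads and returns a RadioOutput(summary, features) pair.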
adaptor_mlp.py ADDED
@@ -0,0 +1,150 @@
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ #
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+ import math
+ from typing import Dict
+
+ import torch
+ from torch import nn
+
+ from einops import rearrange
+ from timm.models.vision_transformer import Block
+
+
+ class MLP(nn.Module):
+     def __init__(self, input_size: int, hidden_size: int, output_size: int,
+                  num_inner: int = 0, device: torch.device = None, **kwargs):
+         super(MLP, self).__init__()
+         self.fc1 = nn.Linear(input_size, hidden_size, device=device)
+         self.norm = nn.LayerNorm(hidden_size, device=device)
+         self.relu = nn.ReLU()
+
+         inner = []
+         for _ in range(num_inner):
+             inner.extend([
+                 nn.Linear(hidden_size, hidden_size, device=device),
+                 nn.LayerNorm(hidden_size, device=device),
+                 nn.ReLU(),
+             ])
+         if inner:
+             self.inner = nn.Sequential(*inner)
+         else:
+             self.inner = nn.Identity()
+
+         self.fc2 = nn.Linear(hidden_size, output_size, device=device)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.fc1(x)
+         x = self.norm(x)
+         x = self.relu(x)
+         x = self.inner(x)
+         x = self.fc2(x)
+         return x
+
+
+ class MLP2(nn.Module):
+     def __init__(self, input_size: int, hidden_size: int, output_size: int,
+                  num_inner: int = 0,
+                  pre_norm: bool = False, device: torch.device = None,
+                  upsample_factor: int = 1,
+                  **kwargs):
+         super().__init__()
+
+         self.pre_norm = nn.Sequential(
+             nn.LayerNorm(input_size),
+             nn.GELU(),
+         ) if pre_norm else nn.Identity()
+
+         self.upsample_factor = upsample_factor
+         self._real_output_dim = output_size
+
+         hidden_size *= upsample_factor
+         output_size *= (upsample_factor ** 2)
+
+         self.fc1 = nn.Linear(input_size, hidden_size, device=device)
+
+         blocks = []
+         for _ in range(num_inner):
+             blocks.append(nn.Sequential(
+                 nn.LayerNorm(hidden_size, device=device),
+                 nn.GELU(),
+                 nn.Linear(hidden_size, hidden_size, device=device),
+             ))
+         self.blocks = nn.ModuleList(blocks)
+
+         self.final = nn.Sequential(
+             nn.LayerNorm(hidden_size, device=device),
+             nn.GELU(),
+             nn.Linear(hidden_size, output_size, device=device),
+         )
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.pre_norm(x)
+         x = self.fc1(x)
+         for block in self.blocks:
+             x = x + block(x)
+         x = self.final(x)
+
+         if self.upsample_factor > 1:
+             h = w = int(math.sqrt(x.shape[1]))
+             x = rearrange(x, 'b (h w) (u1 u2 c) -> b (u1 h u2 w) c',
+                           h=h, w=w, u1=self.upsample_factor, u2=self.upsample_factor,
+                           c=self._real_output_dim)
+
+         return x
+
+
+ MLP_FACTORY = {
+     'v1': MLP,
+     'v2': MLP2,
+ }
+
+
+ def strip_prefix(state: Dict[str, torch.Tensor], prefix: str):
+     state = {
+         k[len(prefix):]: v
+         for k, v in state.items()
+         if k.startswith(prefix)
+     }
+     return state
+
+
+ def get_mlp_info_from_state(version: str, state: Dict[str, torch.Tensor], prefix: str = ''):
+     state = strip_prefix(state, prefix)
+
+     if version == 'v1':
+         hidden_dim, input_dim = state['fc1.weight'].shape
+         output_dim = state['fc2.weight'].shape[0]
+
+         for num_inner in range(1000):
+             k = f'inner.{num_inner}.0.weight'
+             if k not in state:
+                 break
+     elif version == 'v2':
+         hidden_dim, input_dim = state['fc1.weight'].shape
+         output_dim = state['final.2.weight'].shape[0]
+
+         for num_inner in range(1000):
+             k = f'blocks.{num_inner}.0.weight'
+             if k not in state:
+                 break
+     else:
+         raise ValueError(f'Unsupported MLP version: {version}')
+
+     return input_dim, hidden_dim, output_dim, num_inner
+
+
+ def create_mlp_from_state(version: str, state: Dict[str, torch.Tensor], prefix: str = ''):
+     state = strip_prefix(state, prefix)
+
+     input_dim, hidden_dim, output_dim, num_inner = get_mlp_info_from_state(version, state)
+
+     ret: nn.Module = MLP_FACTORY[version](input_dim, hidden_dim, output_dim, num_inner)
+
+     ret.load_state_dict(state)
+
+     return ret
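
A short round-trip sketch (not part of the commit) of how get_mlp_info_from_state recovers an MLP's shape from its weights alone; the dimensions below are made up:

    import torch

    ref = MLP2(input_size=768, hidden_size=1024, output_size=512, num_inner=2)
    state = ref.state_dict()

    # Shapes are read from 'fc1.weight', 'final.2.weight' and the 'blocks.N.*' keys.
    input_dim, hidden_dim, output_dim, num_inner = get_mlp_info_from_state('v2', state)
    # -> (768, 1024, 512, 2)

    rebuilt = create_mlp_from_state('v2', state)
    with torch.no_grad():
        x = torch.randn(4, 768)
        assert torch.allclose(ref(x), rebuilt(x))

    # The same works for prefixed keys, which is how GenericAdaptor consumes a merged state dict:
    head = create_mlp_from_state('v2', {f'summary.{k}': v for k, v in state.items()}, prefix='summary.')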
adaptor_registry.py ADDED
@@ -0,0 +1,37 @@
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ #
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+ from argparse import Namespace
+ from typing import Dict, Any
+
+ import torch
+
+ from .adaptor_generic import GenericAdaptor, AdaptorBase
+
+ dict_t = Dict[str, Any]
+ state_t = Dict[str, torch.Tensor]
+
+
+ class AdaptorRegistry:
+     def __init__(self):
+         self._registry = {}
+
+     def register_adaptor(self, name):
+         def decorator(factory_function):
+             if name in self._registry:
+                 raise ValueError(f"Model '{name}' already registered")
+             self._registry[name] = factory_function
+             return factory_function
+         return decorator
+
+     def create_adaptor(self, name, main_config: Namespace, adaptor_config: dict_t, state: state_t) -> AdaptorBase:
+         if name not in self._registry:
+             return GenericAdaptor(main_config, adaptor_config, state)
+         return self._registry[name](main_config, adaptor_config, state)
+
+ # Creating an instance of the registry
+ adaptor_registry = AdaptorRegistry()
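
A sketch of how the registry is meant to be extended; the adaptor name below is made up, and unknown names simply fall back to GenericAdaptor:

    # Hypothetical custom adaptor registration.
    @adaptor_registry.register_adaptor("my_adaptor")
    def create_my_adaptor(main_config, adaptor_config, state):
        # Any AdaptorBase subclass works; here we just reuse the generic MLP heads.
        return GenericAdaptor(main_config, adaptor_config, state)

    # adaptor = adaptor_registry.create_adaptor("my_adaptor", main_config, adaptor_config, state)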
eradio_model.py CHANGED
@@ -8,7 +8,7 @@
  # distribution of this software and related documentation without an express
  # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
- # E-RADIO (FasterViTv2) model from
+ # E-RADIO model from
  # Mike Ranzinger, Greg Heinrich, Jan Kautz, and Pavlo Molchanov. "AM-RADIO: Agglomerative Model--Reduce All Domains Into One." arXiv preprint arXiv:2312.06709 (2023).
 
  # based on FasterViT, Swin Transformer, YOLOv8
@@ -638,7 +638,7 @@ class Downsample(nn.Module):
  else:
  # removed layer norm for better, in this formulation we are getting 10% better speed
  # LayerNorm for high resolution inputs will be a pain as it pools over the entire spatial dimension
- # therefore we remove it compared to the original implementation in FasterViTv1
+ # therefore we remove it compared to the original implementation in FasterViT
  self.norm = nn.Identity()
  self.reduction = Conv2d_BN(dim, dim_out, 3, 2, 1, bias=False)
 
@@ -790,9 +790,9 @@ class WindowAttention(nn.Module):
 
 
 
- class FasterViTLayer(nn.Module):
+ class ERADIOLayer(nn.Module):
  """
- fastervitlayer
+ E-RADIO Layer
  """
 
  def __init__(self,
@@ -960,7 +960,7 @@ class InterpolateLayer(nn.Module):
  class HiResNeck(nn.Module):
  """
  The block is used to output dense features from all stages
- Otherwise, by default, only the last stage features are returned with FasterViTv2
+ Otherwise, by default, only the last stage features are returned with E-RADIO
  """
  def __init__(self, dim, depths, neck_start_stage, full_features_head_dim, downsample_enabled):
 
@@ -1017,9 +1017,9 @@ class HiResNeck(nn.Module):
  full_features = full_features + feature_projection
  return full_features
 
- class FasterViT(nn.Module):
+ class ERADIO(nn.Module):
  """
- FasterViT
+ Efficient RADIO
  """
 
  def __init__(self,
@@ -1104,7 +1104,7 @@ class FasterViT(nn.Module):
  for i in range(len(depths)):
  conv = True if (i == 0 or i == 1) else False
 
- level = FasterViTLayer(dim=int(dim * 2 ** i),
+ level = ERADIOLayer(dim=int(dim * 2 ** i),
  depth=depths[i],
  num_heads=num_heads[i],
  window_size=window_size[i],
@@ -1208,9 +1208,9 @@ class FasterViT(nn.Module):
 
  def change_window_size(self, new_window_size):
  """
- FasterViT employs windowed attention, which may be sensitive to the choice of this parameter,
+ E-RADIO employs windowed attention, which may be sensitive to the choice of this parameter,
  especially in cases of uneven partitioning of the feature maps.
- FasterViT allows for the adjustment of the window size after training,
+ E-RADIO allows for the adjustment of the window size after training,
  making it adaptable to different input image resolutions.
  The recommended values for window size based on input resolution are as follows:
 
@@ -1243,9 +1243,9 @@ class FasterViT(nn.Module):
  """
  Using hand picked window size for various resolutions.
 
- FasterViT employs windowed attention, which may be sensitive to the choice of this parameter,
+ E-RADIO employs windowed attention, which may be sensitive to the choice of this parameter,
  especially in cases of uneven partitioning of the feature maps.
- FasterViT allows for the adjustment of the window size after training,
+ E-RADIO allows for the adjustment of the window size after training,
  making it adaptable to different input image resolutions.
  The recommended values for window size based on input resolution are as follows:
 
@@ -1288,271 +1288,10 @@ class FasterViT(nn.Module):
1288
 
1289
  self.change_window_size(new_window_size = new_window_size)
1290
 
1291
- # 83.44200001953125
1292
- @register_model
1293
- def fastervit2_small(pretrained=False, **kwargs): #,
1294
- model = FasterViT(depths=[3, 3, 5, 5],
1295
- num_heads=[2, 4, 8, 16],
1296
- window_size=[8, 8, [7, 7], 7],
1297
- dim=96,
1298
- in_dim=64,
1299
- mlp_ratio=4,
1300
- drop_path_rate=0.2,
1301
- sr_ratio=[1, 1, [1, 2], 1],
1302
- use_swiglu=False,
1303
- downsample_shuffle=False,
1304
- yolo_arch=True,
1305
- shuffle_down=False,
1306
- **kwargs)
1307
- if pretrained:
1308
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1309
- return model
1310
-
1311
- # 82.61
1312
- @register_model
1313
- def fastervit2_tiny(pretrained=False, **kwargs): #,
1314
- model = FasterViT(depths=[1, 3, 4, 5],
1315
- num_heads=[2, 4, 8, 16],
1316
- window_size=[8, 8, [7, 7], 7],
1317
- dim=80,
1318
- in_dim=64,
1319
- mlp_ratio=4,
1320
- drop_path_rate=0.2,
1321
- sr_ratio=[1, 1, [2, 1], 1],
1322
- use_swiglu=False,
1323
- downsample_shuffle=False,
1324
- yolo_arch=True,
1325
- shuffle_down=False,
1326
- **kwargs)
1327
- if pretrained:
1328
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1329
- return model
1330
-
1331
- #'top1', 84.31800001220704
1332
- @register_model
1333
- def fastervit2_base(pretrained=False, **kwargs):
1334
- model = FasterViT(depths=[3, 3, 5, 5],
1335
- num_heads=[2, 4, 8, 16],
1336
- window_size=[8, 8, [7, 7], 7],
1337
- dim=128,
1338
- in_dim=64,
1339
- mlp_ratio=4,
1340
- drop_path_rate=0.2,
1341
- sr_ratio=[1, 1, [2, 1], 1],
1342
- use_swiglu=False,
1343
- yolo_arch=True,
1344
- shuffle_down=False,
1345
- conv_base=True,
1346
- **kwargs)
1347
- if pretrained:
1348
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1349
- return model
1350
-
1351
- #84.39999999267579
1352
- @register_model
1353
- def fastervit2_base_v1(pretrained=False, **kwargs):
1354
- model = FasterViT(depths=[4, 4, 5, 5],
1355
- num_heads=[2, 4, 8, 16],
1356
- window_size=[8, 8, [7, 7], 7],
1357
- dim=128,
1358
- in_dim=64,
1359
- mlp_ratio=4,
1360
- drop_path_rate=0.2,
1361
- sr_ratio=[1, 1, [2, 1], 1],
1362
- use_swiglu=False,
1363
- yolo_arch=True,
1364
- shuffle_down=False,
1365
- conv_base=True,
1366
- downsample_shuffle=False,
1367
- **kwargs)
1368
- if pretrained:
1369
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1370
- return model
1371
-
1372
- @register_model
1373
- def fastervit2_base_fullres1(pretrained=False, **kwargs):
1374
- model = FasterViT(depths=[3, 3, 5, 5],
1375
- num_heads=[2, 4, 8, 16],
1376
- window_size=[8, 8, [7, 7], 7],
1377
- dim=128,
1378
- in_dim=64,
1379
- mlp_ratio=4,
1380
- drop_path_rate=0.2,
1381
- sr_ratio=[1, 1, [2, 1], 1],
1382
- use_swiglu=False,
1383
- yolo_arch=True,
1384
- shuffle_down=False,
1385
- conv_base=True,
1386
- use_neck=True,
1387
- full_features_head_dim=1024,
1388
- neck_start_stage=2,
1389
- **kwargs)
1390
- if pretrained:
1391
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1392
- return model
1393
-
1394
- @register_model
1395
- def fastervit2_base_fullres2(pretrained=False, **kwargs):
1396
- model = FasterViT(depths=[3, 3, 5, 5],
1397
- num_heads=[2, 4, 8, 16],
1398
- window_size=[8, 8, [7, 7], 7],
1399
- dim=128,
1400
- in_dim=64,
1401
- mlp_ratio=4,
1402
- drop_path_rate=0.2,
1403
- sr_ratio=[1, 1, [2, 1], 1],
1404
- use_swiglu=False,
1405
- yolo_arch=True,
1406
- shuffle_down=False,
1407
- conv_base=True,
1408
- use_neck=True,
1409
- full_features_head_dim=512,
1410
- neck_start_stage=1,
1411
- **kwargs)
1412
- if pretrained:
1413
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1414
- return model
1415
-
1416
- @register_model
1417
- def fastervit2_base_fullres3(pretrained=False, **kwargs):
1418
- model = FasterViT(depths=[3, 3, 5, 5],
1419
- num_heads=[2, 4, 8, 16],
1420
- window_size=[8, 8, [7, 7], 7],
1421
- dim=128,
1422
- in_dim=64,
1423
- mlp_ratio=4,
1424
- drop_path_rate=0.2,
1425
- sr_ratio=[1, 1, [2, 1], 1],
1426
- use_swiglu=False,
1427
- yolo_arch=True,
1428
- shuffle_down=False,
1429
- conv_base=True,
1430
- use_neck=True,
1431
- full_features_head_dim=256,
1432
- neck_start_stage=1,
1433
- **kwargs)
1434
- if pretrained:
1435
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1436
- return model
1437
-
1438
- @register_model
1439
- def fastervit2_base_fullres4(pretrained=False, **kwargs):
1440
- model = FasterViT(depths=[3, 3, 5, 5],
1441
- num_heads=[2, 4, 8, 16],
1442
- window_size=[8, 8, [7, 7], 7],
1443
- dim=128,
1444
- in_dim=64,
1445
- mlp_ratio=4,
1446
- drop_path_rate=0.2,
1447
- sr_ratio=[1, 1, [2, 1], 1],
1448
- use_swiglu=False,
1449
- yolo_arch=True,
1450
- shuffle_down=False,
1451
- conv_base=True,
1452
- use_neck=True,
1453
- full_features_head_dim=256,
1454
- neck_start_stage=2,
1455
- **kwargs)
1456
- if pretrained:
1457
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1458
- return model
1459
-
1460
- @register_model
1461
- def fastervit2_base_fullres5(pretrained=False, **kwargs):
1462
- model = FasterViT(depths=[3, 3, 5, 5],
1463
- num_heads=[2, 4, 8, 16],
1464
- window_size=[8, 8, [7, 7], 7],
1465
- dim=128,
1466
- in_dim=64,
1467
- mlp_ratio=4,
1468
- drop_path_rate=0.2,
1469
- sr_ratio=[1, 1, [2, 1], 1],
1470
- use_swiglu=False,
1471
- yolo_arch=True,
1472
- shuffle_down=False,
1473
- conv_base=True,
1474
- use_neck=True,
1475
- full_features_head_dim=512,
1476
- neck_start_stage=2,
1477
- **kwargs)
1478
- if pretrained:
1479
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1480
- return model
1481
 
1482
- #84.87
1483
  @register_model
1484
- def fastervit2_large(pretrained=False, **kwargs):
1485
- model = FasterViT(depths=[3, 3, 5, 5],
1486
- num_heads=[2, 4, 8, 16],
1487
- window_size=[8, 8, [7, 7], 7],
1488
- dim=128+64,
1489
- in_dim=64,
1490
- mlp_ratio=4,
1491
- drop_path_rate=0.3,
1492
- sr_ratio=[1, 1, [2, 1], 1],
1493
- use_swiglu=False,
1494
- yolo_arch=False,
1495
- shuffle_down=False,
1496
- cpb_mlp_hidden=64,
1497
- conv_base=True,
1498
- **kwargs)
1499
- if pretrained:
1500
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1501
- return model
1502
-
1503
- @register_model
1504
- def fastervit2_large_fullres(pretrained=False, **kwargs):
1505
- model = FasterViT(
1506
- depths=[3, 3, 5, 5],
1507
- num_heads=[2, 4, 8, 16],
1508
- window_size=[None, None, [7, 7], 7],
1509
- dim=192,
1510
- in_dim=64,
1511
- mlp_ratio=4,
1512
- drop_path_rate=0.0,
1513
- sr_ratio=[1, 1, [2, 1], 1],
1514
- use_swiglu=False,
1515
- yolo_arch=True,
1516
- shuffle_down=False,
1517
- conv_base=True,
1518
- use_neck=True,
1519
- full_features_head_dim=1536,
1520
- neck_start_stage=2,
1521
- **kwargs,
1522
- )
1523
- if pretrained:
1524
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1525
- return model
1526
-
1527
-
1528
- @register_model
1529
- def fastervit2_large_fullres_ws8(pretrained=False, **kwargs):
1530
- model = FasterViT(
1531
- depths=[3, 3, 5, 5],
1532
- num_heads=[2, 4, 8, 16],
1533
- window_size=[None, None, [8, 8], 8],
1534
- dim=192,
1535
- in_dim=64,
1536
- mlp_ratio=4,
1537
- drop_path_rate=0.0,
1538
- sr_ratio=[1, 1, [2, 1], 1],
1539
- use_swiglu=False,
1540
- yolo_arch=True,
1541
- shuffle_down=False,
1542
- conv_base=True,
1543
- use_neck=True,
1544
- full_features_head_dim=1536,
1545
- neck_start_stage=2,
1546
- **kwargs,
1547
- )
1548
- if pretrained:
1549
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1550
- return model
1551
-
1552
-
1553
- @register_model
1554
- def fastervit2_large_fullres_ws16(pretrained=False, **kwargs):
- model = FasterViT(
+ def eradio_large_fullres_ws16(pretrained=False, **kwargs):
+ model = ERADIO(
  depths=[3, 3, 5, 5],
1557
  num_heads=[2, 4, 8, 16],
1558
  window_size=[None, None, [16, 16], 16],
@@ -1575,161 +1314,9 @@ def fastervit2_large_fullres_ws16(pretrained=False, **kwargs):
1575
  return model
1576
 
1577
 
1578
- @register_model
1579
- def fastervit2_large_fullres_ws32(pretrained=False, **kwargs):
1580
- model = FasterViT(
1581
- depths=[3, 3, 5, 5],
1582
- num_heads=[2, 4, 8, 16],
1583
- window_size=[None, None, [32, 32], 32],
1584
- dim=192,
1585
- in_dim=64,
1586
- mlp_ratio=4,
1587
- drop_path_rate=0.0,
1588
- sr_ratio=[1, 1, [2, 1], 1],
1589
- use_swiglu=False,
1590
- yolo_arch=True,
1591
- shuffle_down=False,
1592
- conv_base=True,
1593
- use_neck=True,
1594
- full_features_head_dim=1536,
1595
- neck_start_stage=2,
1596
- **kwargs,
1597
- )
1598
- if pretrained:
1599
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1600
- return model
1601
-
1602
- #85.23% top1
1603
- @register_model
1604
- def fastervit2_xlarge(pretrained=False, **kwargs):
1605
- model = FasterViT(depths=[3, 3, 5, 5],
1606
- num_heads=[2, 4, 8, 16],
1607
- window_size=[8, 8, [7, 7], 7],
1608
- dim=128+128+64,
1609
- in_dim=64,
1610
- mlp_ratio=4,
1611
- drop_path_rate=0.4,
1612
- sr_ratio=[1, 1, [2, 1], 1],
1613
- use_swiglu=False,
1614
- yolo_arch=False,
1615
- shuffle_down=False,
1616
- cpb_mlp_hidden=64,
1617
- **kwargs)
1618
- if pretrained:
1619
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1620
- return model
1621
-
1622
- @register_model
1623
- def fastervit2_huge(pretrained=False, **kwargs):
1624
- model = FasterViT(depths=[3, 3, 5, 5],
1625
- num_heads=[2, 4, 8, 16],
1626
- window_size=[8, 8, [7, 7], 7],
1627
- dim=128+128+128+64,
1628
- in_dim=64,
1629
- mlp_ratio=4,
1630
- drop_path_rate=0.2,
1631
- sr_ratio=[1, 1, [2, 1], 1],
1632
- use_swiglu=False,
1633
- yolo_arch=True,
1634
- shuffle_down=False,
1635
- **kwargs)
1636
- if pretrained:
1637
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1638
- return model
1639
-
1640
-
1641
- # 81.61
1642
- @register_model
1643
- def fastervit2_xtiny(pretrained=False, **kwargs): #,
1644
- model = FasterViT(depths=[1, 3, 4, 5],
1645
- num_heads=[2, 4, 8, 16],
1646
- window_size=[8, 8, [7, 7], 7],
1647
- dim=64,
1648
- in_dim=64,
1649
- mlp_ratio=4,
1650
- drop_path_rate=0.1,
1651
- sr_ratio=[1, 1, [2, 1], 1],
1652
- use_swiglu=False,
1653
- downsample_shuffle=False,
1654
- yolo_arch=True,
1655
- shuffle_down=False,
1656
- cpb_mlp_hidden=64,
1657
- **kwargs)
1658
- if pretrained:
1659
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1660
- return model
1661
-
1662
-
1663
- # 80.19
1664
- @register_model
1665
- def fastervit2_xxtiny(pretrained=False, **kwargs): #,
1666
- model = FasterViT(depths=[1, 3, 4, 5],
1667
- num_heads=[2, 4, 8, 16],
1668
- window_size=[8, 8, [7, 7], 7],
1669
- dim=48,
1670
- in_dim=64,
1671
- mlp_ratio=4,
1672
- drop_path_rate=0.05,
1673
- sr_ratio=[1, 1, [2, 1], 1],
1674
- use_swiglu=False,
1675
- downsample_shuffle=False,
1676
- yolo_arch=True,
1677
- shuffle_down=False,
1678
- cpb_mlp_hidden=64,
1679
- **kwargs)
1680
- if pretrained:
1681
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1682
- return model
1683
-
1684
- @register_model
1685
- # 77.0
1686
- def fastervit2_xxxtiny(pretrained=False, **kwargs): #,
1687
- model = FasterViT(depths=[1, 3, 4, 5],
1688
- num_heads=[2, 4, 8, 16],
1689
- window_size=[8, 8, [7, 7], 7],
1690
- dim=32,
1691
- in_dim=32,
1692
- mlp_ratio=4,
1693
- drop_path_rate=0.0,
1694
- sr_ratio=[1, 1, [2, 1], 1],
1695
- use_swiglu=False,
1696
- downsample_shuffle=False,
1697
- yolo_arch=True,
1698
- shuffle_down=False,
1699
- cpb_mlp_hidden=64,
1700
- **kwargs)
1701
- if pretrained:
1702
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1703
- return model
1704
-
1705
-
1706
- @register_model
1707
- def fastervit2_xxxtiny_fullres(pretrained=False, **kwargs):
1708
- model = FasterViT(depths=[1, 3, 4, 5],
1709
- num_heads=[2, 4, 8, 16],
1710
- window_size=[8, 8, [7, 7], 7],
1711
- dim=32,
1712
- in_dim=32,
1713
- mlp_ratio=4,
1714
- drop_path_rate=0.0,
1715
- sr_ratio=[1, 1, [2, 1], 1],
1716
- use_swiglu=False,
1717
- downsample_shuffle=False,
1718
- yolo_arch=True,
1719
- shuffle_down=False,
1720
- cpb_mlp_hidden=64,
1721
- use_neck=True,
1722
- full_features_head_dim=128,
1723
- neck_start_stage=1,
1724
- conv_groups_ratio = 1,
1725
- **kwargs)
1726
- if pretrained:
1727
- model.load_state_dict(torch.load(pretrained)["state_dict"])
1728
- return model
1729
-
1730
  @register_model
1731
  def eradio_xxxtiny(pretrained=False, **kwargs): # ,
1732
- model = FasterViT(
+ model = ERADIO(
  depths=[1, 3, 4, 5],
1734
  num_heads=[2, 4, 8, 16],
1735
  window_size=[None, None, [16, 16], 16],
@@ -1753,7 +1340,7 @@ def eradio_xxxtiny(pretrained=False, **kwargs): # ,
1753
 
1754
  @register_model
1755
  def eradio_xxxtiny_8x_ws12(pretrained=False, **kwargs):
1756
- model = FasterViT(depths=[1, 3, 4, 5],
+ model = ERADIO(depths=[1, 3, 4, 5],
  num_heads=[2, 4, 8, 16],
1758
  window_size=[None, None, [12, 12], 12],
1759
  dim=32,
@@ -1778,7 +1365,7 @@ def eradio_xxxtiny_8x_ws12(pretrained=False, **kwargs):
1778
 
1779
  @register_model
1780
  def eradio_xxxtiny_8x_ws16(pretrained=False, **kwargs):
1781
- model = FasterViT(depths=[1, 3, 4, 5],
+ model = ERADIO(depths=[1, 3, 4, 5],
  num_heads=[2, 4, 8, 16],
1783
  window_size=[None, None, [16, 16], 16],
1784
  dim=32,
@@ -1802,4 +1389,4 @@ def eradio_xxxtiny_8x_ws16(pretrained=False, **kwargs):
1802
 
1803
  @register_model
1804
  def eradio(pretrained=False, **kwargs):
1805
- return fastervit2_large_fullres_ws16(pretrained=pretrained, **kwargs)
+ return eradio_large_fullres_ws16(pretrained=pretrained, **kwargs)
 
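A minimal construction sketch for the renamed entry points, not part of the commit; the window size is illustrative, and the forward call is left commented because the expected input resolution depends on the configured window size:

    import torch

    model = eradio(pretrained=False)             # wraps eradio_large_fullres_ws16()
    model.change_window_size(new_window_size=16)

    # with torch.no_grad():
    #     out = model(torch.randn(1, 3, 512, 512))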
hf_model.py CHANGED
@@ -12,22 +12,30 @@
  # See the License for the specific language governing permissions and
  # limitations under the License.
  from collections import namedtuple
- from typing import Callable, Optional, List, Union
+ from typing import Optional, List, Union
 
  from timm.models import VisionTransformer
  import torch
- from torch import nn
  from transformers import PretrainedConfig, PreTrainedModel
 
 
  from .common import RESOURCE_MAP, DEFAULT_VERSION
 
- # Force import of eradio_model in order to register it.
+ # Import all required modules.
+ from .adaptor_base import AdaptorBase, RadioOutput, AdaptorInput
+ from .adaptor_generic import GenericAdaptor, AdaptorBase
+ from .adaptor_mlp import create_mlp_from_state
+ from .adaptor_registry import adaptor_registry
+ from .cls_token import ClsToken
+ from .enable_cpe_support import enable_cpe
+ from .enable_spectral_reparam import configure_spectral_reparam_from_args
  from .eradio_model import eradio
  from .radio_model import create_model_from_args
  from .radio_model import RADIOModel as RADIOModelBase, Resolution
  from .input_conditioner import get_default_conditioner, InputConditioner
-
+ from .open_clip_adaptor import OpenCLIP_RADIO
+ from .vit_patch_generator import ViTPatchGenerator
+ from .vitdet import apply_vitdet_arch, VitDetArgs
 
  # Register extra models
  from .extra_timm_models import *
@@ -75,7 +83,7 @@ class RADIOModel(PreTrainedModel):
 
  config_class = RADIOConfig
 
- def __init__(self, config: RADIOConfig):
+ def __init__(self, config):
  super().__init__(config)
 
  RADIOArgs = namedtuple("RADIOArgs", config.args.keys())
@@ -116,10 +124,6 @@ class RADIOModel(PreTrainedModel):
  adaptors=adaptors,
  )
 
- @property
- def adaptors(self) -> nn.ModuleDict:
- return self.radio_model.adaptors
-
  @property
  def model(self) -> VisionTransformer:
  return self.radio_model.model
@@ -128,38 +132,5 @@ class RADIOModel(PreTrainedModel):
  def input_conditioner(self) -> InputConditioner:
  return self.radio_model.input_conditioner
 
- @property
- def num_summary_tokens(self) -> int:
- return self.radio_model.num_summary_tokens
-
- @property
- def patch_size(self) -> int:
- return self.radio_model.patch_size
-
- @property
- def max_resolution(self) -> int:
- return self.radio_model.max_resolution
-
- @property
- def preferred_resolution(self) -> Resolution:
- return self.radio_model.preferred_resolution
-
- @property
- def window_size(self) -> int:
- return self.radio_model.window_size
-
- @property
- def min_resolution_step(self) -> int:
- return self.radio_model.min_resolution_step
-
- def make_preprocessor_external(self) -> Callable[[torch.Tensor], torch.Tensor]:
- return self.radio_model.make_preprocessor_external()
-
- def get_nearest_supported_resolution(self, height: int, width: int) -> Resolution:
- return self.radio_model.get_nearest_supported_resolution(height, width)
-
- def switch_to_deploy(self):
- return self.radio_model.switch_to_deploy()
-
  def forward(self, x: torch.Tensor):
  return self.radio_model.forward(x)
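
With the adaptor modules now imported alongside the model code, the checkpoint can be loaded through transformers remote code as usual. A hedged loading sketch; the Hub repo id and the two-way unpacking of the output are assumptions based on typical RADIO usage, not specified by this commit:

    import torch
    from transformers import AutoModel

    model = AutoModel.from_pretrained("nvidia/RADIO", trust_remote_code=True).eval()

    x = torch.rand(1, 3, 224, 224)    # pixel values in [0, 1]; the InputConditioner normalizes internally
    with torch.no_grad():
        summary, features = model(x)  # pooled summary vector and per-patch features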
open_clip_adaptor.py ADDED
@@ -0,0 +1,41 @@
+ # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+ #
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+ from argparse import Namespace
+
+ import torch
+ from torch import nn
+ import torch.nn.functional as F
+
+ from .adaptor_registry import adaptor_registry, dict_t, state_t
+
+ from .adaptor_generic import GenericAdaptor
+
+
+ class OpenCLIP_RADIO(GenericAdaptor):
+     def __init__(self, main_config: Namespace, adaptor_config: dict_t, state: state_t):
+         super().__init__(main_config, adaptor_config, state)
+
+         import open_clip
+
+         self.oc_model = open_clip.create_model_from_pretrained(
+             model_name=adaptor_config['model'],
+             pretrained=adaptor_config['pretrained'],
+             return_transform=False,
+         )
+         # Unload these parameters
+         self.oc_model.visual = None
+
+         self.tokenizer = open_clip.get_tokenizer(model_name=adaptor_config['model'])
+
+     def encode_text(self, text, normalize: bool = False):
+         return self.oc_model.encode_text(text, normalize=normalize)
+
+
+ @adaptor_registry.register_adaptor("open_clip")
+ def create_open_clip_adaptor(main_config: Namespace, adaptor_config: dict_t, state: state_t):
+     return OpenCLIP_RADIO(main_config, adaptor_config, state)
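
A small sketch of how the text side of this adaptor can be used for zero-shot scoring, assuming clip_adaptor was produced by adaptor_registry.create_adaptor('open_clip', ...) and image_summary came from the matching RADIO head; the helper name is made up:

    import torch
    import torch.nn.functional as F

    def zero_shot_scores(clip_adaptor, image_summary: torch.Tensor, prompts):
        tokens = clip_adaptor.tokenizer(prompts)
        with torch.no_grad():
            text_emb = clip_adaptor.encode_text(tokens, normalize=True)
            img_emb = F.normalize(image_summary, dim=-1)
        return img_emb @ text_emb.T   # cosine similarities, one row per image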
radio_model.py CHANGED
@@ -107,12 +107,6 @@ class RADIOModel(nn.Module):
  fn()
 
  def forward(self, x: torch.Tensor) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
- res_step = self.min_resolution_step
- if res_step is not None and (x.shape[-2] % res_step != 0 or x.shape[-1] % res_step != 0):
- raise ValueError('The input resolution must be a multiple of `self.min_resolution_step`. '
- '`self.get_nearest_supported_resolution(<height>, <width>) is provided as a convenience API. '
- f'Input: {x.shape[-2:]}, Nearest: {self.get_nearest_supported_resolution(*x.shape[-2:])}')
-
  x = self.input_conditioner(x)
  y = self.model.forward_features(x)
 
@@ -133,7 +127,7 @@ class RADIOModel(nn.Module):
  all_summary = y[:, 0]
  bb_summary = all_summary
  all_feat = y[:, 1:]
- elif isinstance(self.model, eradio_model.FasterViT):
+ elif isinstance(self.model, eradio_model.ERADIO):
  _, f = y
  all_feat = f.flatten(2).transpose(1, 2)
  all_summary = all_feat.mean(dim=1)
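
Since this hunk drops the resolution guard from forward(), callers feeding arbitrary image sizes may want to snap inputs themselves. A hedged caller-side sketch; the helper name is made up:

    def snap_resolution(model, height: int, width: int):
        # Mirrors the removed check: inputs should be a multiple of min_resolution_step.
        # model.get_nearest_supported_resolution(height, width) offers the same service when available.
        step = getattr(model, 'min_resolution_step', None)
        if step:
            height, width = (height // step) * step, (width // step) * step
        return height, width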
vitdet.py ADDED
@@ -0,0 +1,173 @@
+ from collections import defaultdict
+ from contextlib import contextmanager
+ from logging import getLogger
+ import math
+ import sys
+ from typing import List, Union, Iterable
+
+ import numpy as np
+ import torch
+ from torch import nn
+
+ from timm.models import VisionTransformer
+ from einops import rearrange
+
+ DEFAULT_NUM_WINDOWED = 5
+
+
+ class VitDetArgs:
+     def __init__(self,
+                  window_size: int,
+                  num_summary_tokens: int,
+                  num_windowed: int = DEFAULT_NUM_WINDOWED,
+     ):
+         self.window_size = window_size
+         self.num_summary_tokens = num_summary_tokens
+         self.num_windowed = num_windowed
+
+
+ def apply_vitdet_arch(model: VisionTransformer, args: VitDetArgs):
+     if isinstance(model, VisionTransformer):
+         patch_embed = getattr(model, 'patch_generator', model.patch_embed)
+
+         return ViTDetHook(patch_embed, model.blocks, args)
+     else:
+         print(f'Warning: Unable to apply VitDet aug!', file=sys.stderr)
+
+
+ class ViTDetHook:
+     def __init__(self,
+                  embedder: nn.Module,
+                  blocks: nn.Sequential,
+                  args: VitDetArgs,
+     ):
+         self.blocks = blocks
+         self.num_summary_tokens = args.num_summary_tokens
+         self.window_size = args.window_size
+
+         self._input_resolution = None
+         self._num_windows = None
+         self._cls_patch = None
+         self._order_cache = dict()
+
+         embedder.register_forward_pre_hook(self._enter_model)
+
+         # This will decide if we window-fy the patches
+         # and enable vit-det for this iteration, and if so,
+         # rearrange the patches for efficient mode switching
+         blocks.register_forward_pre_hook(self._enter_blocks)
+
+         is_global = True
+         period = args.num_windowed + 1
+         for i, layer in enumerate(blocks[:-1]):
+             ctr = i % period
+             if ctr == 0:
+                 layer.register_forward_pre_hook(self._to_windows)
+                 is_global = False
+             elif ctr == args.num_windowed:
+                 layer.register_forward_pre_hook(self._to_global)
+                 is_global = True
+
+         # Always ensure the final layer is a global layer
+         if not is_global:
+             blocks[-1].register_forward_pre_hook(self._to_global)
+
+         blocks.register_forward_hook(self._exit_model)
+
+     def _enter_model(self, _, input: List[torch.Tensor]):
+         self._input_resolution = input[0].shape[-2:]
+
+     def _enter_blocks(self, _, input: List[torch.Tensor]):
+         # print(f'{get_rank()} - ViTDet Window Size: {self._window_size}', file=sys.stderr)
+
+         patches = input[0]
+         patches = self._rearrange_patches(patches)
+
+         return (patches,) + input[1:]
+
+     def _to_windows(self, _, input: List[torch.Tensor]):
+         patches = input[0]
+
+         if self.num_summary_tokens:
+             self._cls_patch = patches[:, :self.num_summary_tokens]
+             patches = patches[:, self.num_summary_tokens:]
+
+         patches = rearrange(
+             patches, 'b (p t) c -> (b p) t c',
+             p=self._num_windows, t=self.window_size ** 2,
+         )
+
+         return (patches,) + input[1:]
+
+     def _to_global(self, _, input: List[torch.Tensor]):
+         patches = input[0]
+
+         patches = rearrange(
+             patches, '(b p) t c -> b (p t) c',
+             p=self._num_windows, t=self.window_size ** 2,
+             b=patches.shape[0] // self._num_windows,
+         )
+
+         if self.num_summary_tokens:
+             patches = torch.cat([
+                 self._cls_patch,
+                 patches,
+             ], dim=1)
+
+         return (patches,) + input[1:]
+
+     def _exit_model(self, _, inputs: List[torch.Tensor], patches: torch.Tensor):
+         # Return patches to their original order
+         patch_order = self._order_cache[self._input_resolution][0]
+         patch_order = patch_order.reshape(1, -1, 1).expand_as(patches)
+
+         ret_patches = torch.empty_like(patches)
+         ret_patches = torch.scatter(
+             ret_patches,
+             dim=1,
+             index=patch_order,
+             src=patches,
+         )
+
+         return ret_patches
+
+     def _rearrange_patches(self, patches: torch.Tensor):
+         # We rearrange the patches so that we can efficiently
+         # switch between windowed and global mode by just
+         # reshaping the tensor
+
+         patch_order, self._num_windows = self._order_cache.get(self._input_resolution, (None, None))
+         if patch_order is None:
+             num_feat_patches = patches.shape[1] - self.num_summary_tokens
+             num_pixels = self._input_resolution[0] * self._input_resolution[1]
+
+             patch_size = int(round(math.sqrt(num_pixels / num_feat_patches)))
+             rows = self._input_resolution[-2] // patch_size
+             cols = self._input_resolution[-1] // patch_size
+
+             w_rows = rows // self.window_size
+             w_cols = cols // self.window_size
+
+             patch_order = torch.arange(0, num_feat_patches, device=patches.device)
+
+             patch_order = rearrange(
+                 patch_order, '(wy py wx px) -> (wy wx py px)',
+                 wy=w_rows, wx=w_cols,
+                 py=self.window_size, px=self.window_size,
+             )
+
+             if self.num_summary_tokens:
+                 patch_order = torch.cat([
+                     torch.arange(self.num_summary_tokens, dtype=patch_order.dtype, device=patch_order.device),
+                     patch_order + self.num_summary_tokens,
+                 ])
+
+             self._num_windows = w_rows * w_cols
+             self._order_cache[self._input_resolution] = (
+                 patch_order,
+                 self._num_windows,
+             )
+
+         patch_order = patch_order.reshape(1, -1, 1).expand_as(patches)
+         patches = torch.gather(patches, dim=1, index=patch_order)
+         return patches
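
A usage sketch for the hook above on a plain timm ViT; the model name and sizes are illustrative, not part of the commit:

    import timm
    import torch

    vit = timm.create_model('vit_base_patch16_224', pretrained=False)

    # 14x14 patch grid -> 2x2 windows of 7x7 patches; one CLS token as the summary token.
    hook = apply_vitdet_arch(vit, VitDetArgs(window_size=7, num_summary_tokens=1))

    with torch.no_grad():
        _ = vit(torch.randn(1, 3, 224, 224))   # blocks now alternate windowed and global attention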