lucasjin committed
Commit
9c82578
1 Parent(s): d4cfa96

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,26 @@
+ {
+   "_name_or_path": "checkpoints/aimv2-3B-patch14-448",
+   "architectures": [
+     "AIMv2Model"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_aimv2.AIMv2Config",
+     "AutoModel": "modeling_aimv2.AIMv2Model",
+     "FlaxAutoModel": "modeling_flax_aimv2.FlaxAIMv2Model"
+   },
+   "hidden_size": 3072,
+   "image_size": 448,
+   "intermediate_size": 8192,
+   "model_type": "aimv2",
+   "num_attention_heads": 24,
+   "num_channels": 3,
+   "num_hidden_layers": 24,
+   "patch_size": 14,
+   "projection_dropout": 0.0,
+   "qkv_bias": false,
+   "rms_norm_eps": 1e-05,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.46.2",
+   "use_bias": false
+ }
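
Because the auto_map above points at modules shipped inside the repository (configuration_aimv2.py, modeling_aimv2.py), loading through the Auto classes requires trust_remote_code=True. A minimal sketch, with a placeholder repo id (substitute the repository this commit belongs to):

from transformers import AutoConfig

# "lucasjin/aimv2-3B-patch14-448" is a hypothetical repo id used for illustration.
config = AutoConfig.from_pretrained(
    "lucasjin/aimv2-3B-patch14-448",
    trust_remote_code=True,  # resolves AIMv2Config via the auto_map entry above
)
print(config.hidden_size, config.num_hidden_layers)  # 3072 24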
configuration_aimv2.py ADDED
@@ -0,0 +1,62 @@
+ from typing import Any
+
+ from transformers.configuration_utils import PretrainedConfig
+
+ __all__ = ["AIMv2Config"]
+
+
+ class AIMv2Config(PretrainedConfig):
+     """This is the configuration class to store the configuration of an [`AIMv2Model`].
+
+     Instantiating a configuration with the defaults will yield a similar configuration
+     to that of the [apple/aimv2-large-patch14-224](https://huggingface.co/apple/aimv2-large-patch14-224).
+
+     Args:
+         hidden_size: Dimension of the hidden representations.
+         intermediate_size: Dimension of the SwiGLU representations.
+         num_hidden_layers: Number of hidden layers in the Transformer.
+         num_attention_heads: Number of attention heads for each attention layer
+             in the Transformer.
+         num_channels: Number of input channels.
+         image_size: Image size.
+         patch_size: Patch size.
+         rms_norm_eps: Epsilon value used for the RMS normalization layer.
+         attention_dropout: Dropout ratio for attention probabilities.
+         projection_dropout: Dropout ratio for the projection layer after the attention.
+         qkv_bias: Whether to add a bias to the queries, keys and values.
+         use_bias: Whether to add a bias in the feed-forward and projection layers.
+         kwargs: Keyword arguments for the [`PretrainedConfig`].
+     """
+
+     model_type: str = "aimv2"
+
+     def __init__(
+         self,
+         hidden_size: int = 1024,
+         intermediate_size: int = 2816,
+         num_hidden_layers: int = 24,
+         num_attention_heads: int = 8,
+         num_channels: int = 3,
+         image_size: int = 224,
+         patch_size: int = 14,
+         rms_norm_eps: float = 1e-5,
+         attention_dropout: float = 0.0,
+         projection_dropout: float = 0.0,
+         qkv_bias: bool = False,
+         use_bias: bool = False,
+         **kwargs: Any,
+     ):
+         super().__init__(**kwargs)
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.patch_size = patch_size
+         self.image_size = image_size
+         self.attention_dropout = attention_dropout
+         self.rms_norm_eps = rms_norm_eps
+
+         self.projection_dropout = projection_dropout
+         self.qkv_bias = qkv_bias
+         self.use_bias = use_bias
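
The defaults above describe the large (1024-dim) variant; the values in config.json override them for this 3B, 448 px checkpoint. A quick sketch of instantiating both, assuming the file is importable from the working directory:

from configuration_aimv2 import AIMv2Config

cfg_large = AIMv2Config()   # defaults: aimv2-large-patch14-224 geometry
cfg_3b = AIMv2Config(       # overrides matching config.json in this commit
    hidden_size=3072,
    intermediate_size=8192,
    num_attention_heads=24,
    image_size=448,
)
assert cfg_3b.patch_size == 14  # shared default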
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f445eaeab8c48ae50ab0de0157b47747b484064ed73e770e64db09eabc93927a
+ size 5446053960
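
This is a Git LFS pointer, not the weights themselves; the ~5.4 GB blob is fetched on download, and the oid lets you check its integrity. A small sketch:

import hashlib

def sha256_of(path: str, chunk_size: int = 1 << 20) -> str:
    # Stream the file so the 5.4 GB blob never has to fit in memory.
    digest = hashlib.sha256()
    with open(path, "rb") as f:
        while chunk := f.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()

# Should equal the oid above once the actual blob has been pulled.
print(sha256_of("model.safetensors") ==
      "f445eaeab8c48ae50ab0de0157b47747b484064ed73e770e64db09eabc93927a")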
model.safetensors.index.json ADDED
@@ -0,0 +1,180 @@
+ {
+   "metadata": {
+     "total_size": 5446035456
+   },
+   "weight_map": {
+     "preprocessor.patchifier.norm.weight": "model-00001-of-00002.safetensors",
+     "preprocessor.patchifier.proj.bias": "model-00001-of-00002.safetensors",
+     "preprocessor.patchifier.proj.weight": "model-00001-of-00002.safetensors",
+     "preprocessor.pos_embed": "model-00001-of-00002.safetensors",
+     "trunk.blocks.0.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.0.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.0.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.0.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.0.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.1.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.1.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.1.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.1.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.1.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.10.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.10.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.10.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.10.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.10.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.11.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.11.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.11.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.11.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.11.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.12.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.12.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.12.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.12.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.12.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.13.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.13.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.13.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.13.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.13.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.14.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.14.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.14.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.14.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.14.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.15.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.15.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.15.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.15.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.15.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.16.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.16.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.16.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.16.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.16.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.17.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.17.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.17.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.17.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.17.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.18.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.18.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.18.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.18.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.18.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.19.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.19.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.19.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.19.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.19.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.2.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.2.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.2.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.2.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.2.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.20.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.20.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.20.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.20.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.20.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.21.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.21.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.21.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.21.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.21.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.22.attn.proj.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.22.attn.qkv.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.22.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.22.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.22.mlp.fc3.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.22.norm_1.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.22.norm_2.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.23.attn.proj.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.23.attn.qkv.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.23.mlp.fc1.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.23.mlp.fc2.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.23.mlp.fc3.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.23.norm_1.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.23.norm_2.weight": "model-00002-of-00002.safetensors",
+     "trunk.blocks.3.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.3.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.3.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.3.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.3.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.4.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.4.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.4.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.4.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.4.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.5.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.5.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.5.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.5.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.5.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.6.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.6.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.6.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.6.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.6.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.7.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.7.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.7.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.7.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.7.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.8.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.8.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.8.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.8.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.8.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.9.attn.proj.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.9.attn.qkv.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.9.mlp.fc3.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.9.norm_1.weight": "model-00001-of-00002.safetensors",
+     "trunk.blocks.9.norm_2.weight": "model-00001-of-00002.safetensors",
+     "trunk.post_trunk_norm.weight": "model-00002-of-00002.safetensors"
+   }
+ }
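
The index maps every parameter name to the shard that stores it; here blocks 22-23 and the final trunk norm live in the second shard and everything else in the first. Note the index references two shard files (model-00001-of-00002.safetensors, model-00002-of-00002.safetensors) rather than the single model.safetensors uploaded above. A minimal sketch of consulting it directly:

import json

with open("model.safetensors.index.json") as f:
    index = json.load(f)

# Look up which shard holds a given tensor.
print(index["weight_map"]["trunk.blocks.23.attn.qkv.weight"])
# -> model-00002-of-00002.safetensors

# Sanity check: every mapped shard is one of the two expected files.
assert set(index["weight_map"].values()) == {
    "model-00001-of-00002.safetensors",
    "model-00002-of-00002.safetensors",
}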
modeling_aimv2.py ADDED
@@ -0,0 +1,191 @@
+ from typing import Optional, Tuple, Union
+
+ import torch
+ from .configuration_aimv2 import AIMv2Config
+ from torch import nn
+ from torch.nn import functional as F
+ from transformers.modeling_outputs import BaseModelOutputWithNoAttention
+ from transformers.modeling_utils import PreTrainedModel
+
+ __all__ = ["AIMv2Model"]
+
+
+ class RMSNorm(nn.Module):
+     def __init__(self, dim: int, eps: float = 1e-6):
+         super().__init__()
+         self.weight = nn.Parameter(torch.ones(dim))
+         self.eps = eps
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         output = self._norm(x.float()).type_as(x)
+         return output * self.weight
+
+     def extra_repr(self) -> str:
+         return f"{tuple(self.weight.shape)}, eps={self.eps}"
+
+     def _norm(self, x: torch.Tensor) -> torch.Tensor:
+         return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
+
+
+ class AIMv2SwiGLUFFN(nn.Module):
+     def __init__(self, config: AIMv2Config):
+         super().__init__()
+         hidden_features = config.intermediate_size
+         in_features = config.hidden_size
+         bias = config.use_bias
+
+         self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
+         self.fc2 = nn.Linear(hidden_features, in_features, bias=bias)
+         self.fc3 = nn.Linear(in_features, hidden_features, bias=bias)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = F.silu(self.fc1(x)) * self.fc3(x)
+         x = self.fc2(x)
+         return x
+
+
+ class AIMv2PatchEmbed(nn.Module):
+     def __init__(self, config: AIMv2Config):
+         super().__init__()
+         self.proj = nn.Conv2d(
+             config.num_channels,
+             config.hidden_size,
+             kernel_size=(config.patch_size, config.patch_size),
+             stride=(config.patch_size, config.patch_size),
+         )
+         self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         x = self.proj(x).flatten(2).transpose(1, 2)
+         x = self.norm(x)
+         return x
+
+
+ class AIMv2ViTPreprocessor(nn.Module):
+     def __init__(self, config: AIMv2Config):
+         super().__init__()
+         num_patches = (config.image_size // config.patch_size) ** 2
+
+         self.patchifier = AIMv2PatchEmbed(config)
+         self.pos_embed = nn.Parameter(torch.zeros((1, num_patches, config.hidden_size)))
+
+     def forward(self, x: torch.Tensor) -> torch.Tensor:
+         tokens = self.patchifier(x)
+         _, N, _ = tokens.shape
+         pos_embed = self.pos_embed.to(tokens.device)
+         tokens = tokens + pos_embed[:, :N]
+         return tokens
+
+
+ class AIMv2Attention(nn.Module):
+     def __init__(self, config: AIMv2Config):
+         super().__init__()
+         dim = config.hidden_size
+
+         self.num_heads = config.num_attention_heads
+         self.qkv = nn.Linear(dim, dim * 3, bias=config.qkv_bias)
+         self.attn_drop = nn.Dropout(config.attention_dropout)
+         self.proj = nn.Linear(dim, dim, bias=config.use_bias)
+         self.proj_drop = nn.Dropout(config.projection_dropout)
+
+     def forward(
+         self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
+     ) -> torch.Tensor:
+         B, N, C = x.shape
+         qkv = (
+             self.qkv(x)
+             .reshape(B, N, 3, self.num_heads, C // self.num_heads)
+             .permute(2, 0, 3, 1, 4)
+         )
+         q, k, v = qkv.unbind(0)
+
+         x = F.scaled_dot_product_attention(q, k, v, attn_mask=mask)
+         x = x.transpose(1, 2).contiguous().reshape(B, N, C)
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
+
+
+ class AIMv2Block(nn.Module):
+     def __init__(self, config: AIMv2Config):
+         super().__init__()
+         self.attn = AIMv2Attention(config)
+         self.norm_1 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.mlp = AIMv2SwiGLUFFN(config)
+         self.norm_2 = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+     def forward(
+         self, x: torch.Tensor, mask: Optional[torch.Tensor] = None
+     ) -> torch.Tensor:
+         x = x + self.attn(self.norm_1(x), mask)
+         x = x + self.mlp(self.norm_2(x))
+         return x
+
+
+ class AIMv2Transformer(nn.Module):
+     def __init__(self, config: AIMv2Config):
+         super().__init__()
+         self.blocks = nn.ModuleList(
+             [AIMv2Block(config) for _ in range(config.num_hidden_layers)]
+         )
+         self.post_trunk_norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+     def forward(
+         self,
+         tokens: torch.Tensor,
+         mask: Optional[torch.Tensor] = None,
+         output_hidden_states: bool = False,
+     ) -> Tuple[torch.Tensor, Optional[Tuple[torch.Tensor, ...]]]:
+         hidden_states = () if output_hidden_states else None
+         for block in self.blocks:
+             tokens = block(tokens, mask)
+             if output_hidden_states:
+                 hidden_states += (tokens,)
+         tokens = self.post_trunk_norm(tokens)
+         return tokens, hidden_states
+
+
+ class AIMv2PretrainedModel(PreTrainedModel):
+     config_class = AIMv2Config
+     base_model_prefix = "aimv2"
+     main_input_name = "pixel_values"
+     _supports_sdpa = True
+
+
+ class AIMv2Model(AIMv2PretrainedModel):
+     def __init__(self, config: AIMv2Config):
+         super().__init__(config)
+         self.preprocessor = AIMv2ViTPreprocessor(config)
+         self.trunk = AIMv2Transformer(config)
+
+     def forward(
+         self,
+         pixel_values: torch.Tensor,
+         mask: Optional[torch.Tensor] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+     ) -> Union[
+         Tuple[torch.Tensor],
+         Tuple[torch.Tensor, Tuple[torch.Tensor, ...]],
+         BaseModelOutputWithNoAttention,
+     ]:
+         if output_hidden_states is None:
+             output_hidden_states = self.config.output_hidden_states
+         if return_dict is None:
+             return_dict = self.config.use_return_dict
+
+         x = self.preprocessor(pixel_values)
+         x, hidden_states = self.trunk(
+             x, mask, output_hidden_states=output_hidden_states
+         )
+
+         if not return_dict:
+             res = (x,)
+             res += (hidden_states,) if output_hidden_states else ()
+             return res
+
+         return BaseModelOutputWithNoAttention(
+             last_hidden_state=x,
+             hidden_states=hidden_states,
+         )
+
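
End to end, the model patchifies a 448x448 image into (448/14)^2 = 1024 tokens, adds learned position embeddings, and runs 24 pre-norm blocks with SwiGLU MLPs. A usage sketch, again with a placeholder repo id:

import torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

repo = "lucasjin/aimv2-3B-patch14-448"  # hypothetical repo id for illustration
processor = AutoImageProcessor.from_pretrained(repo)
model = AutoModel.from_pretrained(repo, trust_remote_code=True, torch_dtype=torch.bfloat16)

pixel_values = processor(images=Image.open("example.jpg"), return_tensors="pt").pixel_values
with torch.no_grad():
    out = model(pixel_values.to(model.dtype))  # match the bfloat16 weights
print(out.last_hidden_state.shape)  # torch.Size([1, 1024, 3072])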
preprocessor_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "crop_size": {
+     "height": 448,
+     "width": 448
+   },
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessor",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "size": {
+     "shortest_edge": 448
+   }
+ }
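
The processor resizes the shortest edge to 448, center-crops to 448x448, rescales by 1/255 (the 0.00392... factor above), and normalizes with the CLIP mean/std. A sketch of running it standalone, assuming this file sits in the current directory:

from PIL import Image
from transformers import CLIPImageProcessor

processor = CLIPImageProcessor.from_pretrained(".")  # directory containing this file
pixel_values = processor(images=Image.open("example.jpg"), return_tensors="pt").pixel_values
print(pixel_values.shape)  # torch.Size([1, 3, 448, 448])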