Upload folder using huggingface_hub
Browse files
- .gitattributes +1 -34
- README.md +32 -0
- __init__.py +4 -0
- __pycache__/configuration_f2p_decoder.cpython-312.pyc +0 -0
- __pycache__/decoder.cpython-312.pyc +0 -0
- __pycache__/modeling_f2p_decoder.cpython-312.pyc +0 -0
- config.json +40 -0
- configuration_f2p_decoder.py +68 -0
- convert_original_checkpoint.py +42 -0
- decoder.py +1149 -0
- model.safetensors +3 -0
- modeling_f2p_decoder.py +93 -0
.gitattributes
CHANGED
@@ -1,35 +1,2 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,32 @@
---
library_name: transformers
tags:
- vision
- image-reconstruction
- siglip2
- safetensors
---

# F2P Decoder

Hugging Face `AutoModel` wrapper for the SigLIP2 feature-to-pixel decoder used in this repository.

```python
import torch
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "your-namespace/f2p_decoder",
    trust_remote_code=True,
).eval()

features = torch.randn(1, 257, 1152)
reconstruction = model(features)
print(reconstruction.shape)  # (1, 3, 224, 224)
```

The model expects SigLIP2 patch features with a CLS token, for example from
`google/siglip2-so400m-patch14-224`. The output is an image tensor in the
decoder's reconstructed pixel space.

Source `.pt` checkpoint: `nyu-visionx/siglip2_decoder/model.pt`.
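The README example stops at printing the reconstruction shape. As a small follow-up sketch (not part of the committed README), the code below turns the decoder output into a saveable image, assuming the output lives in the normalized pixel space defined by `image_mean`/`image_std` in `config.json` (both 0.5); `your-namespace/f2p_decoder` is the same placeholder repo id used above, and the random feature tensor stands in for real SigLIP2 features.

```python
import numpy as np
import torch
from PIL import Image
from transformers import AutoModel

model = AutoModel.from_pretrained(
    "your-namespace/f2p_decoder",  # placeholder namespace, as in the README
    trust_remote_code=True,
).eval()

features = torch.randn(1, 257, 1152)  # stand-in for real SigLIP2 features (CLS + 256 patches)
with torch.no_grad():
    reconstruction = model(features)  # (1, 3, 224, 224)

# Assumption: undo the 0.5/0.5 normalization from config.json to get [0, 1] pixels.
mean = torch.tensor(model.config.image_mean).view(1, 3, 1, 1)
std = torch.tensor(model.config.image_std).view(1, 3, 1, 1)
pixels = (reconstruction * std + mean).clamp(0.0, 1.0)

array = (pixels[0].permute(1, 2, 0).numpy() * 255).astype(np.uint8)
Image.fromarray(array).save("reconstruction.png")
```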
__init__.py
ADDED
@@ -0,0 +1,4 @@
from .configuration_f2p_decoder import F2PDecoderConfig
from .modeling_f2p_decoder import F2PDecoderModel

__all__ = ["F2PDecoderConfig", "F2PDecoderModel"]
__pycache__/configuration_f2p_decoder.cpython-312.pyc
ADDED
Binary file (3.53 kB)

__pycache__/decoder.cpython-312.pyc
ADDED
Binary file (53.7 kB)

__pycache__/modeling_f2p_decoder.cpython-312.pyc
ADDED
Binary file (4.62 kB)
config.json
ADDED
@@ -0,0 +1,40 @@
{
  "architectures": [
    "F2PDecoderModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_f2p_decoder.F2PDecoderConfig",
    "AutoModel": "modeling_f2p_decoder.F2PDecoderModel"
  },
  "decoder_hidden_size": 1152,
  "decoder_intermediate_size": 4096,
  "decoder_num_attention_heads": 16,
  "decoder_num_hidden_layers": 28,
  "drop_cls_token": true,
  "dtype": "float32",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1152,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_size": 224,
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "model_type": "f2p_decoder",
  "num_channels": 3,
  "num_patches": 256,
  "patch_size": 14,
  "pretrained_encoder_name": "google/siglip2-so400m-patch14-224",
  "qkv_bias": true,
  "source_decoder_repo": "nyu-visionx/siglip2_decoder",
  "transformers_version": "4.57.6"
}
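The `auto_map` block is what lets the stock `Auto*` classes resolve the custom `f2p_decoder` model type to the Python files in this repository. A minimal sketch of that lookup, assuming the same placeholder repo id as the README:

```python
from transformers import AutoConfig

# trust_remote_code allows AutoConfig to follow the auto_map entry above to
# configuration_f2p_decoder.F2PDecoderConfig inside the downloaded repo.
config = AutoConfig.from_pretrained("your-namespace/f2p_decoder", trust_remote_code=True)
print(config.model_type)   # "f2p_decoder"
print(config.num_patches)  # 256
```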
configuration_f2p_decoder.py
ADDED
@@ -0,0 +1,68 @@
from transformers import PretrainedConfig


class F2PDecoderConfig(PretrainedConfig):
    """Configuration for a feature-to-pixel reconstruction decoder."""

    model_type = "f2p_decoder"

    def __init__(
        self,
        pretrained_encoder_name: str = "google/siglip2-so400m-patch14-224",
        source_decoder_repo: str = "nyu-visionx/siglip2_decoder",
        image_size: int = 224,
        patch_size: int = 14,
        num_channels: int = 3,
        hidden_size: int = 1152,
        decoder_hidden_size: int = 1152,
        decoder_num_hidden_layers: int = 28,
        decoder_num_attention_heads: int = 16,
        decoder_intermediate_size: int = 4096,
        hidden_act: str = "gelu",
        hidden_dropout_prob: float = 0.0,
        attention_probs_dropout_prob: float = 0.0,
        initializer_range: float = 0.02,
        layer_norm_eps: float = 1e-12,
        qkv_bias: bool = True,
        num_patches: int = 256,
        drop_cls_token: bool = True,
        image_mean: list[float] | None = None,
        image_std: list[float] | None = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        if getattr(self, "auto_map", None) is None:
            self.auto_map = {
                "AutoConfig": "configuration_f2p_decoder.F2PDecoderConfig",
                "AutoModel": "modeling_f2p_decoder.F2PDecoderModel",
            }

        if image_mean is None:
            image_mean = [0.5, 0.5, 0.5]
        if image_std is None:
            image_std = [0.5, 0.5, 0.5]
        if len(image_mean) != num_channels or len(image_std) != num_channels:
            raise ValueError("image_mean and image_std must match num_channels.")
        if not drop_cls_token:
            raise ValueError("Only drop_cls_token=True is supported by this decoder.")

        self.pretrained_encoder_name = pretrained_encoder_name
        self.source_decoder_repo = source_decoder_repo
        self.image_size = int(image_size)
        self.patch_size = int(patch_size)
        self.num_channels = int(num_channels)
        self.hidden_size = int(hidden_size)
        self.decoder_hidden_size = int(decoder_hidden_size)
        self.decoder_num_hidden_layers = int(decoder_num_hidden_layers)
        self.decoder_num_attention_heads = int(decoder_num_attention_heads)
        self.decoder_intermediate_size = int(decoder_intermediate_size)
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = float(hidden_dropout_prob)
        self.attention_probs_dropout_prob = float(attention_probs_dropout_prob)
        self.initializer_range = float(initializer_range)
        self.layer_norm_eps = float(layer_norm_eps)
        self.qkv_bias = bool(qkv_bias)
        self.num_patches = int(num_patches)
        self.drop_cls_token = bool(drop_cls_token)
        self.image_mean = [float(value) for value in image_mean]
        self.image_std = [float(value) for value in image_std]
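A quick sketch of how the constructor above behaves when used directly; everything here follows from the defaults and the two `ValueError` checks in the file, nothing else is assumed.

```python
from configuration_f2p_decoder import F2PDecoderConfig

config = F2PDecoderConfig()  # defaults mirror config.json
# 224 / 14 = 16 patches per side, so 16 * 16 = 256 patches in total.
assert config.num_patches == (config.image_size // config.patch_size) ** 2

try:
    F2PDecoderConfig(drop_cls_token=False)
except ValueError as err:
    print(err)  # "Only drop_cls_token=True is supported by this decoder."
```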
convert_original_checkpoint.py
ADDED
@@ -0,0 +1,42 @@
import argparse
from pathlib import Path

import torch
from huggingface_hub import hf_hub_download

from configuration_f2p_decoder import F2PDecoderConfig
from modeling_f2p_decoder import F2PDecoderModel


def convert(output_dir: str) -> None:
    output_path = Path(output_dir)
    checkpoint_path = hf_hub_download("nyu-visionx/siglip2_decoder", "model.pt")
    state_dict = torch.load(checkpoint_path, map_location="cpu")
    state_dict = {f"decoder.{key}": value for key, value in state_dict.items()}

    config = F2PDecoderConfig()
    model = F2PDecoderModel(config)
    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
    unexpected_keys = [key for key in unexpected_keys if key]
    missing_keys = [
        key for key in missing_keys if key not in {"image_mean", "image_std"}
    ]
    if missing_keys or unexpected_keys:
        raise RuntimeError(
            "Checkpoint conversion mismatch: "
            f"missing={missing_keys}, unexpected={unexpected_keys}"
        )

    model.save_pretrained(output_path, safe_serialization=True)
    print(f"Saved Hugging Face artifact to {output_path}")


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("--output_dir", default="hf_artifacts/f2p_decoder")
    args = parser.parse_args()
    convert(args.output_dir)


if __name__ == "__main__":
    main()
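After running the script (for example `python convert_original_checkpoint.py --output_dir hf_artifacts/f2p_decoder`), a smoke test along the following lines could confirm the saved artifact loads and runs; this is a hedged sketch, and the forward signature is taken from the README example rather than from code shown in this commit.

```python
import torch
from modeling_f2p_decoder import F2PDecoderModel

# Reload the directory written by convert() and run a dummy forward pass.
model = F2PDecoderModel.from_pretrained("hf_artifacts/f2p_decoder").eval()
features = torch.randn(1, 257, 1152)  # CLS token + 256 patch features, 1152-dim
with torch.no_grad():
    reconstruction = model(features)
print(tuple(reconstruction.shape))  # expected: (1, 3, 224, 224)
```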
decoder.py
ADDED
@@ -0,0 +1,1149 @@
| 1 |
+
# coding=utf-8
|
| 2 |
+
# Copyright 2022 Facebook AI and The HuggingFace Inc. team. All rights reserved.
|
| 3 |
+
#
|
| 4 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
| 5 |
+
# you may not use this file except in compliance with the License.
|
| 6 |
+
# You may obtain a copy of the License at
|
| 7 |
+
#
|
| 8 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
| 9 |
+
#
|
| 10 |
+
# Unless required by applicable law or agreed to in writing, software
|
| 11 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
| 12 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
| 13 |
+
# See the License for the specific language governing permissions and
|
| 14 |
+
# limitations under the License.
|
| 15 |
+
"""PyTorch ViT MAE (masked autoencoder) model."""
|
| 16 |
+
|
| 17 |
+
import collections.abc
|
| 18 |
+
import math
|
| 19 |
+
from copy import deepcopy
|
| 20 |
+
from dataclasses import dataclass
|
| 21 |
+
from typing import Optional, Set, Tuple, Union
|
| 22 |
+
|
| 23 |
+
import numpy as np
|
| 24 |
+
import torch
|
| 25 |
+
import torch.utils.checkpoint
|
| 26 |
+
from torch import nn
|
| 27 |
+
|
| 28 |
+
# correct the above import to the following
|
| 29 |
+
from transformers.models.vit_mae.configuration_vit_mae import ViTMAEConfig
|
| 30 |
+
from transformers.utils import (
|
| 31 |
+
ModelOutput,
|
| 32 |
+
add_start_docstrings,
|
| 33 |
+
logging,
|
| 34 |
+
replace_return_docstrings,
|
| 35 |
+
add_start_docstrings_to_model_forward,
|
| 36 |
+
)
|
| 37 |
+
from transformers.pytorch_utils import (
|
| 38 |
+
find_pruneable_heads_and_indices,
|
| 39 |
+
prune_linear_layer,
|
| 40 |
+
)
|
| 41 |
+
from transformers.activations import ACT2FN
|
| 42 |
+
from transformers.modeling_outputs import BaseModelOutput
|
| 43 |
+
from transformers.modeling_utils import PreTrainedModel
|
| 44 |
+
|
| 45 |
+
logger = logging.get_logger(__name__)
|
| 46 |
+
_CONFIG_FOR_DOC = "ViTMAEConfig"
|
| 47 |
+
_CHECKPOINT_FOR_DOC = "facebook/vit-mae-base"
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
@dataclass
|
| 51 |
+
class ViTMAEModelOutput(ModelOutput):
|
| 52 |
+
"""
|
| 53 |
+
Class for ViTMAEModel's outputs, with potential hidden states and attentions.
|
| 54 |
+
|
| 55 |
+
Args:
|
| 56 |
+
last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
|
| 57 |
+
Sequence of hidden-states at the output of the last layer of the model.
|
| 58 |
+
mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
| 59 |
+
Tensor indicating which patches are masked (1) and which are not (0).
|
| 60 |
+
ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 61 |
+
Tensor containing the original index of the (shuffled) masked patches.
|
| 62 |
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
| 63 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
| 64 |
+
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
| 65 |
+
plus the initial embedding outputs.
|
| 66 |
+
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
| 67 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
| 68 |
+
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
| 69 |
+
the self-attention heads.
|
| 70 |
+
"""
|
| 71 |
+
|
| 72 |
+
last_hidden_state: torch.FloatTensor = None
|
| 73 |
+
mask: torch.LongTensor = None
|
| 74 |
+
ids_restore: torch.LongTensor = None
|
| 75 |
+
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
| 76 |
+
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
@dataclass
|
| 80 |
+
class ViTMAEDecoderOutput(ModelOutput):
|
| 81 |
+
"""
|
| 82 |
+
Class for ViTMAEDecoder's outputs, with potential hidden states and attentions.
|
| 83 |
+
|
| 84 |
+
Args:
|
| 85 |
+
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
|
| 86 |
+
Pixel reconstruction logits.
|
| 87 |
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
| 88 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
| 89 |
+
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
| 90 |
+
plus the initial embedding outputs.
|
| 91 |
+
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
| 92 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
| 93 |
+
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
| 94 |
+
the self-attention heads.
|
| 95 |
+
"""
|
| 96 |
+
|
| 97 |
+
logits: torch.FloatTensor = None
|
| 98 |
+
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
| 99 |
+
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
| 100 |
+
|
| 101 |
+
|
| 102 |
+
@dataclass
|
| 103 |
+
class ViTMAEForPreTrainingOutput(ModelOutput):
|
| 104 |
+
"""
|
| 105 |
+
Class for ViTMAEForPreTraining's outputs, with potential hidden states and attentions.
|
| 106 |
+
|
| 107 |
+
Args:
|
| 108 |
+
loss (`torch.FloatTensor` of shape `(1,)`):
|
| 109 |
+
Pixel reconstruction loss.
|
| 110 |
+
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
|
| 111 |
+
Pixel reconstruction logits.
|
| 112 |
+
mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
|
| 113 |
+
Tensor indicating which patches are masked (1) and which are not (0).
|
| 114 |
+
ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
| 115 |
+
Tensor containing the original index of the (shuffled) masked patches.
|
| 116 |
+
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
| 117 |
+
Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
|
| 118 |
+
shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
|
| 119 |
+
plus the initial embedding outputs.
|
| 120 |
+
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
| 121 |
+
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
| 122 |
+
sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
|
| 123 |
+
the self-attention heads.
|
| 124 |
+
"""
|
| 125 |
+
|
| 126 |
+
loss: Optional[torch.FloatTensor] = None
|
| 127 |
+
logits: torch.FloatTensor = None
|
| 128 |
+
mask: torch.LongTensor = None
|
| 129 |
+
ids_restore: torch.LongTensor = None
|
| 130 |
+
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
| 131 |
+
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def get_2d_sincos_pos_embed(embed_dim, grid_size, add_cls_token=False):
|
| 135 |
+
"""
|
| 136 |
+
Create 2D sin/cos positional embeddings.
|
| 137 |
+
|
| 138 |
+
Args:
|
| 139 |
+
embed_dim (`int`):
|
| 140 |
+
Embedding dimension.
|
| 141 |
+
grid_size (`int`):
|
| 142 |
+
The grid height and width.
|
| 143 |
+
add_cls_token (`bool`, *optional*, defaults to `False`):
|
| 144 |
+
Whether or not to add a classification (CLS) token.
|
| 145 |
+
|
| 146 |
+
Returns:
|
| 147 |
+
(`torch.FloatTensor` of shape (grid_size*grid_size, embed_dim) or (1+grid_size*grid_size, embed_dim): the
|
| 148 |
+
position embeddings (with or without classification token)
|
| 149 |
+
"""
|
| 150 |
+
grid_h = np.arange(grid_size, dtype=np.float32)
|
| 151 |
+
grid_w = np.arange(grid_size, dtype=np.float32)
|
| 152 |
+
grid = np.meshgrid(grid_w, grid_h) # here w goes first
|
| 153 |
+
grid = np.stack(grid, axis=0)
|
| 154 |
+
|
| 155 |
+
grid = grid.reshape([2, 1, grid_size, grid_size])
|
| 156 |
+
pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid)
|
| 157 |
+
if add_cls_token:
|
| 158 |
+
pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], axis=0)
|
| 159 |
+
return pos_embed
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
|
| 163 |
+
if embed_dim % 2 != 0:
|
| 164 |
+
raise ValueError("embed_dim must be even")
|
| 165 |
+
|
| 166 |
+
# use half of dimensions to encode grid_h
|
| 167 |
+
emb_h = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[0]) # (H*W, D/2)
|
| 168 |
+
emb_w = get_1d_sincos_pos_embed_from_grid(embed_dim // 2, grid[1]) # (H*W, D/2)
|
| 169 |
+
|
| 170 |
+
emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D)
|
| 171 |
+
return emb
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
|
| 175 |
+
"""
|
| 176 |
+
embed_dim: output dimension for each position pos: a list of positions to be encoded: size (M,) out: (M, D)
|
| 177 |
+
"""
|
| 178 |
+
if embed_dim % 2 != 0:
|
| 179 |
+
raise ValueError("embed_dim must be even")
|
| 180 |
+
|
| 181 |
+
omega = np.arange(embed_dim // 2, dtype=float)
|
| 182 |
+
omega /= embed_dim / 2.0
|
| 183 |
+
omega = 1.0 / 10000**omega # (D/2,)
|
| 184 |
+
|
| 185 |
+
pos = pos.reshape(-1) # (M,)
|
| 186 |
+
out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product
|
| 187 |
+
|
| 188 |
+
emb_sin = np.sin(out) # (M, D/2)
|
| 189 |
+
emb_cos = np.cos(out) # (M, D/2)
|
| 190 |
+
|
| 191 |
+
emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D)
|
| 192 |
+
return emb
|
| 193 |
+
|
| 194 |
+
|
| 195 |
+
class ViTMAEEmbeddings(nn.Module):
|
| 196 |
+
"""
|
| 197 |
+
Construct the CLS token, position and patch embeddings.
|
| 198 |
+
|
| 199 |
+
"""
|
| 200 |
+
|
| 201 |
+
def __init__(self, config):
|
| 202 |
+
super().__init__()
|
| 203 |
+
|
| 204 |
+
self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
|
| 205 |
+
self.patch_embeddings = ViTMAEPatchEmbeddings(config)
|
| 206 |
+
self.num_patches = self.patch_embeddings.num_patches
|
| 207 |
+
# fixed sin-cos embedding
|
| 208 |
+
self.position_embeddings = nn.Parameter(
|
| 209 |
+
torch.zeros(1, self.num_patches + 1, config.hidden_size),
|
| 210 |
+
requires_grad=False,
|
| 211 |
+
)
|
| 212 |
+
self.config = config
|
| 213 |
+
self.initialize_weights()
|
| 214 |
+
|
| 215 |
+
def initialize_weights(self):
|
| 216 |
+
# initialize (and freeze) position embeddings by sin-cos embedding
|
| 217 |
+
pos_embed = get_2d_sincos_pos_embed(
|
| 218 |
+
self.position_embeddings.shape[-1],
|
| 219 |
+
int(self.patch_embeddings.num_patches**0.5),
|
| 220 |
+
add_cls_token=True,
|
| 221 |
+
)
|
| 222 |
+
self.position_embeddings.data.copy_(
|
| 223 |
+
torch.from_numpy(pos_embed).float().unsqueeze(0)
|
| 224 |
+
)
|
| 225 |
+
|
| 226 |
+
# initialize patch_embeddings like nn.Linear (instead of nn.Conv2d)
|
| 227 |
+
w = self.patch_embeddings.projection.weight.data
|
| 228 |
+
torch.nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
|
| 229 |
+
|
| 230 |
+
# timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
|
| 231 |
+
torch.nn.init.normal_(self.cls_token, std=self.config.initializer_range)
|
| 232 |
+
|
| 233 |
+
def interpolate_pos_encoding(
|
| 234 |
+
self, embeddings: torch.Tensor, height: int, width: int
|
| 235 |
+
) -> torch.Tensor:
|
| 236 |
+
"""
|
| 237 |
+
This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
|
| 238 |
+
resolution images.
|
| 239 |
+
|
| 240 |
+
Source:
|
| 241 |
+
https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
|
| 242 |
+
"""
|
| 243 |
+
num_patches = embeddings.shape[1] - 1
|
| 244 |
+
num_positions = self.position_embeddings.shape[1] - 1
|
| 245 |
+
|
| 246 |
+
if num_patches == num_positions and height == width:
|
| 247 |
+
return self.position_embeddings
|
| 248 |
+
|
| 249 |
+
class_pos_embed = self.position_embeddings[:, 0, :]
|
| 250 |
+
patch_pos_embed = self.position_embeddings[:, 1:, :]
|
| 251 |
+
dim = embeddings.shape[-1]
|
| 252 |
+
h0 = height // self.config.patch_size
|
| 253 |
+
w0 = width // self.config.patch_size
|
| 254 |
+
# we add a small number to avoid floating point error in the interpolation
|
| 255 |
+
# see discussion at https://github.com/facebookresearch/dino/issues/8
|
| 256 |
+
h0, w0 = h0 + 0.1, w0 + 0.1
|
| 257 |
+
patch_pos_embed = patch_pos_embed.reshape(
|
| 258 |
+
1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim
|
| 259 |
+
)
|
| 260 |
+
patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
|
| 261 |
+
patch_pos_embed = nn.functional.interpolate(
|
| 262 |
+
patch_pos_embed,
|
| 263 |
+
scale_factor=(h0 / math.sqrt(num_positions), w0 / math.sqrt(num_positions)),
|
| 264 |
+
mode="bicubic",
|
| 265 |
+
align_corners=False,
|
| 266 |
+
)
|
| 267 |
+
if int(h0) != patch_pos_embed.shape[-2] or int(w0) != patch_pos_embed.shape[-1]:
|
| 268 |
+
raise ValueError(
|
| 269 |
+
"Width or height does not match with the interpolated position embeddings"
|
| 270 |
+
)
|
| 271 |
+
patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
|
| 272 |
+
return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
|
| 273 |
+
|
| 274 |
+
def random_masking(self, sequence, noise=None):
|
| 275 |
+
"""
|
| 276 |
+
Perform per-sample random masking by per-sample shuffling. Per-sample shuffling is done by argsort random
|
| 277 |
+
noise.
|
| 278 |
+
|
| 279 |
+
Args:
|
| 280 |
+
sequence (`torch.LongTensor` of shape `(batch_size, sequence_length, dim)`)
|
| 281 |
+
noise (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*) which is
|
| 282 |
+
mainly used for testing purposes to control randomness and maintain the reproducibility
|
| 283 |
+
"""
|
| 284 |
+
batch_size, seq_length, dim = sequence.shape
|
| 285 |
+
len_keep = int(seq_length * (1 - self.config.mask_ratio))
|
| 286 |
+
|
| 287 |
+
if noise is None:
|
| 288 |
+
noise = torch.rand(
|
| 289 |
+
batch_size, seq_length, device=sequence.device
|
| 290 |
+
) # noise in [0, 1]
|
| 291 |
+
|
| 292 |
+
# sort noise for each sample
|
| 293 |
+
ids_shuffle = torch.argsort(noise, dim=1).to(
|
| 294 |
+
sequence.device
|
| 295 |
+
) # ascend: small is keep, large is remove
|
| 296 |
+
ids_restore = torch.argsort(ids_shuffle, dim=1).to(sequence.device)
|
| 297 |
+
|
| 298 |
+
# keep the first subset
|
| 299 |
+
ids_keep = ids_shuffle[:, :len_keep]
|
| 300 |
+
sequence_unmasked = torch.gather(
|
| 301 |
+
sequence, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, dim)
|
| 302 |
+
)
|
| 303 |
+
|
| 304 |
+
# generate the binary mask: 0 is keep, 1 is remove
|
| 305 |
+
mask = torch.ones([batch_size, seq_length], device=sequence.device)
|
| 306 |
+
mask[:, :len_keep] = 0
|
| 307 |
+
# unshuffle to get the binary mask
|
| 308 |
+
mask = torch.gather(mask, dim=1, index=ids_restore)
|
| 309 |
+
|
| 310 |
+
return sequence_unmasked, mask, ids_restore
|
| 311 |
+
|
| 312 |
+
def forward(self, pixel_values, noise=None, interpolate_pos_encoding: bool = False):
|
| 313 |
+
batch_size, num_channels, height, width = pixel_values.shape
|
| 314 |
+
embeddings = self.patch_embeddings(
|
| 315 |
+
pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
|
| 316 |
+
)
|
| 317 |
+
if interpolate_pos_encoding:
|
| 318 |
+
position_embeddings = self.interpolate_pos_encoding(
|
| 319 |
+
embeddings, height, width
|
| 320 |
+
)
|
| 321 |
+
else:
|
| 322 |
+
position_embeddings = self.position_embeddings
|
| 323 |
+
|
| 324 |
+
# add position embeddings w/o cls token
|
| 325 |
+
embeddings = embeddings + position_embeddings[:, 1:, :]
|
| 326 |
+
|
| 327 |
+
# masking: length -> length * config.mask_ratio
|
| 328 |
+
embeddings, mask, ids_restore = self.random_masking(embeddings, noise)
|
| 329 |
+
|
| 330 |
+
# append cls token
|
| 331 |
+
cls_token = self.cls_token + position_embeddings[:, :1, :]
|
| 332 |
+
cls_tokens = cls_token.expand(embeddings.shape[0], -1, -1)
|
| 333 |
+
embeddings = torch.cat((cls_tokens, embeddings), dim=1)
|
| 334 |
+
|
| 335 |
+
return embeddings, mask, ids_restore
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
class ViTMAEPatchEmbeddings(nn.Module):
|
| 339 |
+
"""
|
| 340 |
+
This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
|
| 341 |
+
`hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
|
| 342 |
+
Transformer.
|
| 343 |
+
"""
|
| 344 |
+
|
| 345 |
+
def __init__(self, config):
|
| 346 |
+
super().__init__()
|
| 347 |
+
image_size, patch_size = config.image_size, config.patch_size
|
| 348 |
+
num_channels, hidden_size = config.num_channels, config.hidden_size
|
| 349 |
+
image_size = (
|
| 350 |
+
image_size
|
| 351 |
+
if isinstance(image_size, collections.abc.Iterable)
|
| 352 |
+
else (image_size, image_size)
|
| 353 |
+
)
|
| 354 |
+
patch_size = (
|
| 355 |
+
patch_size
|
| 356 |
+
if isinstance(patch_size, collections.abc.Iterable)
|
| 357 |
+
else (patch_size, patch_size)
|
| 358 |
+
)
|
| 359 |
+
num_patches = (image_size[1] // patch_size[1]) * (
|
| 360 |
+
image_size[0] // patch_size[0]
|
| 361 |
+
)
|
| 362 |
+
self.image_size = image_size
|
| 363 |
+
self.patch_size = patch_size
|
| 364 |
+
self.num_channels = num_channels
|
| 365 |
+
self.num_patches = num_patches
|
| 366 |
+
|
| 367 |
+
self.projection = nn.Conv2d(
|
| 368 |
+
num_channels, hidden_size, kernel_size=patch_size, stride=patch_size
|
| 369 |
+
)
|
| 370 |
+
|
| 371 |
+
def forward(self, pixel_values, interpolate_pos_encoding: bool = False):
|
| 372 |
+
batch_size, num_channels, height, width = pixel_values.shape
|
| 373 |
+
if num_channels != self.num_channels:
|
| 374 |
+
raise ValueError(
|
| 375 |
+
"Make sure that the channel dimension of the pixel values match with the one set in the configuration."
|
| 376 |
+
)
|
| 377 |
+
|
| 378 |
+
if not interpolate_pos_encoding and (
|
| 379 |
+
height != self.image_size[0] or width != self.image_size[1]
|
| 380 |
+
):
|
| 381 |
+
raise ValueError(
|
| 382 |
+
f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
|
| 383 |
+
)
|
| 384 |
+
x = self.projection(pixel_values).flatten(2).transpose(1, 2)
|
| 385 |
+
return x
|
| 386 |
+
|
| 387 |
+
|
| 388 |
+
# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention ViT->ViTMAE
|
| 389 |
+
class ViTMAESelfAttention(nn.Module):
|
| 390 |
+
def __init__(self, config: ViTMAEConfig) -> None:
|
| 391 |
+
super().__init__()
|
| 392 |
+
if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
|
| 393 |
+
config, "embedding_size"
|
| 394 |
+
):
|
| 395 |
+
raise ValueError(
|
| 396 |
+
f"The hidden size {(config.hidden_size,)} is not a multiple of the number of attention "
|
| 397 |
+
f"heads {config.num_attention_heads}."
|
| 398 |
+
)
|
| 399 |
+
|
| 400 |
+
self.num_attention_heads = config.num_attention_heads
|
| 401 |
+
self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
|
| 402 |
+
self.all_head_size = self.num_attention_heads * self.attention_head_size
|
| 403 |
+
|
| 404 |
+
self.query = nn.Linear(
|
| 405 |
+
config.hidden_size, self.all_head_size, bias=config.qkv_bias
|
| 406 |
+
)
|
| 407 |
+
self.key = nn.Linear(
|
| 408 |
+
config.hidden_size, self.all_head_size, bias=config.qkv_bias
|
| 409 |
+
)
|
| 410 |
+
self.value = nn.Linear(
|
| 411 |
+
config.hidden_size, self.all_head_size, bias=config.qkv_bias
|
| 412 |
+
)
|
| 413 |
+
|
| 414 |
+
self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
|
| 415 |
+
|
| 416 |
+
def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
|
| 417 |
+
new_x_shape = x.size()[:-1] + (
|
| 418 |
+
self.num_attention_heads,
|
| 419 |
+
self.attention_head_size,
|
| 420 |
+
)
|
| 421 |
+
x = x.view(new_x_shape)
|
| 422 |
+
return x.permute(0, 2, 1, 3)
|
| 423 |
+
|
| 424 |
+
def forward(
|
| 425 |
+
self,
|
| 426 |
+
hidden_states,
|
| 427 |
+
head_mask: Optional[torch.Tensor] = None,
|
| 428 |
+
output_attentions: bool = False,
|
| 429 |
+
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
|
| 430 |
+
mixed_query_layer = self.query(hidden_states)
|
| 431 |
+
|
| 432 |
+
key_layer = self.transpose_for_scores(self.key(hidden_states))
|
| 433 |
+
value_layer = self.transpose_for_scores(self.value(hidden_states))
|
| 434 |
+
query_layer = self.transpose_for_scores(mixed_query_layer)
|
| 435 |
+
|
| 436 |
+
# Take the dot product between "query" and "key" to get the raw attention scores.
|
| 437 |
+
attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
|
| 438 |
+
|
| 439 |
+
attention_scores = attention_scores / math.sqrt(self.attention_head_size)
|
| 440 |
+
|
| 441 |
+
# Normalize the attention scores to probabilities.
|
| 442 |
+
attention_probs = nn.functional.softmax(attention_scores, dim=-1)
|
| 443 |
+
|
| 444 |
+
# This is actually dropping out entire tokens to attend to, which might
|
| 445 |
+
# seem a bit unusual, but is taken from the original Transformer paper.
|
| 446 |
+
attention_probs = self.dropout(attention_probs)
|
| 447 |
+
|
| 448 |
+
# Mask heads if we want to
|
| 449 |
+
if head_mask is not None:
|
| 450 |
+
attention_probs = attention_probs * head_mask
|
| 451 |
+
|
| 452 |
+
context_layer = torch.matmul(attention_probs, value_layer)
|
| 453 |
+
|
| 454 |
+
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
| 455 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
| 456 |
+
context_layer = context_layer.view(new_context_layer_shape)
|
| 457 |
+
|
| 458 |
+
outputs = (
|
| 459 |
+
(context_layer, attention_probs) if output_attentions else (context_layer,)
|
| 460 |
+
)
|
| 461 |
+
|
| 462 |
+
return outputs
|
| 463 |
+
|
| 464 |
+
|
| 465 |
+
# Copied from transformers.models.vit.modeling_vit.ViTSdpaSelfAttention ViT->ViTMAE
|
| 466 |
+
class ViTMAESdpaSelfAttention(ViTMAESelfAttention):
|
| 467 |
+
def __init__(self, config: ViTMAEConfig) -> None:
|
| 468 |
+
super().__init__(config)
|
| 469 |
+
self.attention_probs_dropout_prob = config.attention_probs_dropout_prob
|
| 470 |
+
|
| 471 |
+
def forward(
|
| 472 |
+
self,
|
| 473 |
+
hidden_states,
|
| 474 |
+
head_mask: Optional[torch.Tensor] = None,
|
| 475 |
+
output_attentions: bool = False,
|
| 476 |
+
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
|
| 477 |
+
mixed_query_layer = self.query(hidden_states)
|
| 478 |
+
|
| 479 |
+
key_layer = self.transpose_for_scores(self.key(hidden_states))
|
| 480 |
+
value_layer = self.transpose_for_scores(self.value(hidden_states))
|
| 481 |
+
query_layer = self.transpose_for_scores(mixed_query_layer)
|
| 482 |
+
|
| 483 |
+
context_layer = torch.nn.functional.scaled_dot_product_attention(
|
| 484 |
+
query_layer,
|
| 485 |
+
key_layer,
|
| 486 |
+
value_layer,
|
| 487 |
+
head_mask,
|
| 488 |
+
self.attention_probs_dropout_prob if self.training else 0.0,
|
| 489 |
+
is_causal=False,
|
| 490 |
+
scale=None,
|
| 491 |
+
)
|
| 492 |
+
|
| 493 |
+
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
|
| 494 |
+
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
|
| 495 |
+
context_layer = context_layer.view(new_context_layer_shape)
|
| 496 |
+
|
| 497 |
+
return context_layer, None
|
| 498 |
+
|
| 499 |
+
|
| 500 |
+
# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->ViTMAE
|
| 501 |
+
class ViTMAESelfOutput(nn.Module):
|
| 502 |
+
"""
|
| 503 |
+
The residual connection is defined in ViTMAELayer instead of here (as is the case with other models), due to the
|
| 504 |
+
layernorm applied before each block.
|
| 505 |
+
"""
|
| 506 |
+
|
| 507 |
+
def __init__(self, config: ViTMAEConfig) -> None:
|
| 508 |
+
super().__init__()
|
| 509 |
+
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
|
| 510 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
| 511 |
+
|
| 512 |
+
def forward(
|
| 513 |
+
self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
|
| 514 |
+
) -> torch.Tensor:
|
| 515 |
+
hidden_states = self.dense(hidden_states)
|
| 516 |
+
hidden_states = self.dropout(hidden_states)
|
| 517 |
+
|
| 518 |
+
return hidden_states
|
| 519 |
+
|
| 520 |
+
|
| 521 |
+
# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->ViTMAE
|
| 522 |
+
class ViTMAEAttention(nn.Module):
|
| 523 |
+
def __init__(self, config: ViTMAEConfig) -> None:
|
| 524 |
+
super().__init__()
|
| 525 |
+
self.attention = ViTMAESelfAttention(config)
|
| 526 |
+
self.output = ViTMAESelfOutput(config)
|
| 527 |
+
self.pruned_heads = set()
|
| 528 |
+
|
| 529 |
+
def prune_heads(self, heads: Set[int]) -> None:
|
| 530 |
+
if len(heads) == 0:
|
| 531 |
+
return
|
| 532 |
+
heads, index = find_pruneable_heads_and_indices(
|
| 533 |
+
heads,
|
| 534 |
+
self.attention.num_attention_heads,
|
| 535 |
+
self.attention.attention_head_size,
|
| 536 |
+
self.pruned_heads,
|
| 537 |
+
)
|
| 538 |
+
|
| 539 |
+
# Prune linear layers
|
| 540 |
+
self.attention.query = prune_linear_layer(self.attention.query, index)
|
| 541 |
+
self.attention.key = prune_linear_layer(self.attention.key, index)
|
| 542 |
+
self.attention.value = prune_linear_layer(self.attention.value, index)
|
| 543 |
+
self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
|
| 544 |
+
|
| 545 |
+
# Update hyper params and store pruned heads
|
| 546 |
+
self.attention.num_attention_heads = self.attention.num_attention_heads - len(
|
| 547 |
+
heads
|
| 548 |
+
)
|
| 549 |
+
self.attention.all_head_size = (
|
| 550 |
+
self.attention.attention_head_size * self.attention.num_attention_heads
|
| 551 |
+
)
|
| 552 |
+
self.pruned_heads = self.pruned_heads.union(heads)
|
| 553 |
+
|
| 554 |
+
def forward(
|
| 555 |
+
self,
|
| 556 |
+
hidden_states: torch.Tensor,
|
| 557 |
+
head_mask: Optional[torch.Tensor] = None,
|
| 558 |
+
output_attentions: bool = False,
|
| 559 |
+
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
|
| 560 |
+
self_outputs = self.attention(hidden_states, head_mask, output_attentions)
|
| 561 |
+
|
| 562 |
+
attention_output = self.output(self_outputs[0], hidden_states)
|
| 563 |
+
|
| 564 |
+
outputs = (attention_output,) + self_outputs[
|
| 565 |
+
1:
|
| 566 |
+
] # add attentions if we output them
|
| 567 |
+
return outputs
|
| 568 |
+
|
| 569 |
+
|
| 570 |
+
# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->ViTMAE
|
| 571 |
+
class ViTMAESdpaAttention(ViTMAEAttention):
|
| 572 |
+
def __init__(self, config: ViTMAEConfig) -> None:
|
| 573 |
+
super().__init__(config)
|
| 574 |
+
self.attention = ViTMAESdpaSelfAttention(config)
|
| 575 |
+
|
| 576 |
+
|
| 577 |
+
# Copied from transformers.models.vit.modeling_vit.ViTIntermediate ViT->ViTMAE
|
| 578 |
+
class ViTMAEIntermediate(nn.Module):
|
| 579 |
+
def __init__(self, config: ViTMAEConfig) -> None:
|
| 580 |
+
super().__init__()
|
| 581 |
+
self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
|
| 582 |
+
if isinstance(config.hidden_act, str):
|
| 583 |
+
self.intermediate_act_fn = ACT2FN[config.hidden_act]
|
| 584 |
+
else:
|
| 585 |
+
self.intermediate_act_fn = config.hidden_act
|
| 586 |
+
|
| 587 |
+
def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
| 588 |
+
hidden_states = self.dense(hidden_states)
|
| 589 |
+
hidden_states = self.intermediate_act_fn(hidden_states)
|
| 590 |
+
|
| 591 |
+
return hidden_states
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
# Copied from transformers.models.vit.modeling_vit.ViTOutput ViT->ViTMAE
|
| 595 |
+
class ViTMAEOutput(nn.Module):
|
| 596 |
+
def __init__(self, config: ViTMAEConfig) -> None:
|
| 597 |
+
super().__init__()
|
| 598 |
+
self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
|
| 599 |
+
self.dropout = nn.Dropout(config.hidden_dropout_prob)
|
| 600 |
+
|
| 601 |
+
def forward(
|
| 602 |
+
self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
|
| 603 |
+
) -> torch.Tensor:
|
| 604 |
+
hidden_states = self.dense(hidden_states)
|
| 605 |
+
hidden_states = self.dropout(hidden_states)
|
| 606 |
+
|
| 607 |
+
hidden_states = hidden_states + input_tensor
|
| 608 |
+
|
| 609 |
+
return hidden_states
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
VITMAE_ATTENTION_CLASSES = {
|
| 613 |
+
"eager": ViTMAEAttention,
|
| 614 |
+
"sdpa": ViTMAESdpaAttention,
|
| 615 |
+
}
|
| 616 |
+
|
| 617 |
+
|
| 618 |
+
# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->ViTMAE,VIT->VITMAE
|
| 619 |
+
class ViTMAELayer(nn.Module):
|
| 620 |
+
"""This corresponds to the Block class in the timm implementation."""
|
| 621 |
+
|
| 622 |
+
def __init__(self, config: ViTMAEConfig) -> None:
|
| 623 |
+
super().__init__()
|
| 624 |
+
self.chunk_size_feed_forward = config.chunk_size_feed_forward
|
| 625 |
+
self.seq_len_dim = 1
|
| 626 |
+
self.attention = VITMAE_ATTENTION_CLASSES[config._attn_implementation](config)
|
| 627 |
+
self.intermediate = ViTMAEIntermediate(config)
|
| 628 |
+
self.output = ViTMAEOutput(config)
|
| 629 |
+
self.layernorm_before = nn.LayerNorm(
|
| 630 |
+
config.hidden_size, eps=config.layer_norm_eps
|
| 631 |
+
)
|
| 632 |
+
self.layernorm_after = nn.LayerNorm(
|
| 633 |
+
config.hidden_size, eps=config.layer_norm_eps
|
| 634 |
+
)
|
| 635 |
+
|
| 636 |
+
def forward(
|
| 637 |
+
self,
|
| 638 |
+
hidden_states: torch.Tensor,
|
| 639 |
+
head_mask: Optional[torch.Tensor] = None,
|
| 640 |
+
output_attentions: bool = False,
|
| 641 |
+
) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
|
| 642 |
+
self_attention_outputs = self.attention(
|
| 643 |
+
self.layernorm_before(
|
| 644 |
+
hidden_states
|
| 645 |
+
), # in ViTMAE, layernorm is applied before self-attention
|
| 646 |
+
head_mask,
|
| 647 |
+
output_attentions=output_attentions,
|
| 648 |
+
)
|
| 649 |
+
attention_output = self_attention_outputs[0]
|
| 650 |
+
outputs = self_attention_outputs[
|
| 651 |
+
1:
|
| 652 |
+
] # add self attentions if we output attention weights
|
| 653 |
+
|
| 654 |
+
# first residual connection
|
| 655 |
+
hidden_states = attention_output + hidden_states
|
| 656 |
+
|
| 657 |
+
# in ViTMAE, layernorm is also applied after self-attention
|
| 658 |
+
layer_output = self.layernorm_after(hidden_states)
|
| 659 |
+
layer_output = self.intermediate(layer_output)
|
| 660 |
+
|
| 661 |
+
# second residual connection is done here
|
| 662 |
+
layer_output = self.output(layer_output, hidden_states)
|
| 663 |
+
|
| 664 |
+
outputs = (layer_output,) + outputs
|
| 665 |
+
|
| 666 |
+
return outputs
|
| 667 |
+
|
| 668 |
+
|
| 669 |
+
# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->ViTMAE
|
| 670 |
+
class ViTMAEEncoder(nn.Module):
|
| 671 |
+
def __init__(self, config: ViTMAEConfig) -> None:
|
| 672 |
+
super().__init__()
|
| 673 |
+
self.config = config
|
| 674 |
+
self.layer = nn.ModuleList(
|
| 675 |
+
[ViTMAELayer(config) for _ in range(config.num_hidden_layers)]
|
| 676 |
+
)
|
| 677 |
+
self.gradient_checkpointing = False
|
| 678 |
+
|
| 679 |
+
def forward(
|
| 680 |
+
self,
|
| 681 |
+
hidden_states: torch.Tensor,
|
| 682 |
+
head_mask: Optional[torch.Tensor] = None,
|
| 683 |
+
output_attentions: bool = False,
|
| 684 |
+
output_hidden_states: bool = False,
|
| 685 |
+
return_dict: bool = True,
|
| 686 |
+
) -> Union[tuple, BaseModelOutput]:
|
| 687 |
+
all_hidden_states = () if output_hidden_states else None
|
| 688 |
+
all_self_attentions = () if output_attentions else None
|
| 689 |
+
|
| 690 |
+
for i, layer_module in enumerate(self.layer):
|
| 691 |
+
if output_hidden_states:
|
| 692 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 693 |
+
|
| 694 |
+
layer_head_mask = head_mask[i] if head_mask is not None else None
|
| 695 |
+
|
| 696 |
+
if self.gradient_checkpointing and self.training:
|
| 697 |
+
layer_outputs = self._gradient_checkpointing_func(
|
| 698 |
+
layer_module.__call__,
|
| 699 |
+
hidden_states,
|
| 700 |
+
layer_head_mask,
|
| 701 |
+
output_attentions,
|
| 702 |
+
)
|
| 703 |
+
else:
|
| 704 |
+
layer_outputs = layer_module(
|
| 705 |
+
hidden_states, layer_head_mask, output_attentions
|
| 706 |
+
)
|
| 707 |
+
|
| 708 |
+
hidden_states = layer_outputs[0]
|
| 709 |
+
|
| 710 |
+
if output_attentions:
|
| 711 |
+
all_self_attentions = all_self_attentions + (layer_outputs[1],)
|
| 712 |
+
|
| 713 |
+
if output_hidden_states:
|
| 714 |
+
all_hidden_states = all_hidden_states + (hidden_states,)
|
| 715 |
+
|
| 716 |
+
if not return_dict:
|
| 717 |
+
return tuple(
|
| 718 |
+
v
|
| 719 |
+
for v in [hidden_states, all_hidden_states, all_self_attentions]
|
| 720 |
+
if v is not None
|
| 721 |
+
)
|
| 722 |
+
return BaseModelOutput(
|
| 723 |
+
last_hidden_state=hidden_states,
|
| 724 |
+
hidden_states=all_hidden_states,
|
| 725 |
+
attentions=all_self_attentions,
|
| 726 |
+
)
|
| 727 |
+
|
| 728 |
+
|
| 729 |
+
class ViTMAEPreTrainedModel(PreTrainedModel):
|
| 730 |
+
"""
|
| 731 |
+
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
|
| 732 |
+
models.
|
| 733 |
+
"""
|
| 734 |
+
|
| 735 |
+
config_class = ViTMAEConfig
|
| 736 |
+
base_model_prefix = "vit"
|
| 737 |
+
main_input_name = "pixel_values"
|
| 738 |
+
supports_gradient_checkpointing = True
|
| 739 |
+
_supports_sdpa = True
|
| 740 |
+
|
| 741 |
+
def _init_weights(self, module):
|
| 742 |
+
"""Initialize the weights"""
|
| 743 |
+
if isinstance(module, (nn.Linear, nn.Conv2d)):
|
| 744 |
+
# Slightly different from the TF version which uses truncated_normal for initialization
|
| 745 |
+
# cf https://github.com/pytorch/pytorch/pull/5617
|
| 746 |
+
module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
|
| 747 |
+
if module.bias is not None:
|
| 748 |
+
module.bias.data.zero_()
|
| 749 |
+
elif isinstance(module, nn.LayerNorm):
|
| 750 |
+
module.bias.data.zero_()
|
| 751 |
+
module.weight.data.fill_(1.0)
|
| 752 |
+
|
| 753 |
+
|
| 754 |
+
VIT_MAE_START_DOCSTRING = r"""
|
| 755 |
+
This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
|
| 756 |
+
as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
|
| 757 |
+
behavior.
|
| 758 |
+
|
| 759 |
+
Parameters:
|
| 760 |
+
config ([`ViTMAEConfig`]): Model configuration class with all the parameters of the model.
|
| 761 |
+
Initializing with a config file does not load the weights associated with the model, only the
|
| 762 |
+
configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
| 763 |
+
"""
|
| 764 |
+
|
| 765 |
+
VIT_MAE_INPUTS_DOCSTRING = r"""
|
| 766 |
+
Args:
|
| 767 |
+
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
|
| 768 |
+
Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
|
| 769 |
+
for details.
|
| 770 |
+
|
| 771 |
+
head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
|
| 772 |
+
Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
|
| 773 |
+
|
| 774 |
+
- 1 indicates the head is **not masked**,
|
| 775 |
+
- 0 indicates the head is **masked**.
|
| 776 |
+
|
| 777 |
+
output_attentions (`bool`, *optional*):
|
| 778 |
+
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
| 779 |
+
tensors for more detail.
|
| 780 |
+
output_hidden_states (`bool`, *optional*):
|
| 781 |
+
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
| 782 |
+
more detail.
|
| 783 |
+
return_dict (`bool`, *optional*):
|
| 784 |
+
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
| 785 |
+
interpolate_pos_encoding (`bool`, *optional*, default `False`):
|
| 786 |
+
Whether to interpolate the pre-trained position encodings. This is mainly used to use the model on higher
|
| 787 |
+
resolution images.
|
| 788 |
+
"""


@add_start_docstrings(
    "The bare ViTMAE Model transformer outputting raw hidden-states without any specific head on top.",
    VIT_MAE_START_DOCSTRING,
)
class ViTMAEModel(ViTMAEPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.config = config

        self.embeddings = ViTMAEEmbeddings(config)
        self.encoder = ViTMAEEncoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base
        class PreTrainedModel.
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    @add_start_docstrings_to_model_forward(VIT_MAE_INPUTS_DOCSTRING)
    @replace_return_docstrings(
        output_type=ViTMAEModelOutput, config_class=_CONFIG_FOR_DOC
    )
    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        noise: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        interpolate_pos_encoding: bool = False,
    ) -> Union[Tuple, ViTMAEModelOutput]:
        r"""
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ViTMAEModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
        >>> model = ViTMAEModel.from_pretrained("facebook/vit-mae-base")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicates we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        embedding_output, mask, ids_restore = self.embeddings(
            pixel_values, noise=noise, interpolate_pos_encoding=interpolate_pos_encoding
        )

        encoder_outputs = self.encoder(
            embedding_output,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        if not return_dict:
            return (sequence_output, mask, ids_restore) + encoder_outputs[1:]

        return ViTMAEModelOutput(
            last_hidden_state=sequence_output,
            mask=mask,
            ids_restore=ids_restore,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class GeneralDecoder(nn.Module):
    def __init__(self, config, num_patches):
        super().__init__()
        self.decoder_embed = nn.Linear(
            config.hidden_size, config.decoder_hidden_size, bias=True
        )
        self.decoder_pos_embed = nn.Parameter(
            torch.zeros(1, num_patches + 1, config.decoder_hidden_size),
            requires_grad=False,
        )  # fixed sin-cos embedding

        decoder_config = deepcopy(config)
        decoder_config.hidden_size = config.decoder_hidden_size
        decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
        decoder_config.num_attention_heads = config.decoder_num_attention_heads
        decoder_config.intermediate_size = config.decoder_intermediate_size
        self.decoder_layers = nn.ModuleList(
            [
                ViTMAELayer(decoder_config)
                for _ in range(config.decoder_num_hidden_layers)
            ]
        )

        self.decoder_norm = nn.LayerNorm(
            config.decoder_hidden_size, eps=config.layer_norm_eps
        )
        self.decoder_pred = nn.Linear(
            config.decoder_hidden_size,
            config.patch_size**2 * config.num_channels,
            bias=True,
        )  # decoder hidden states to patch pixels
        self.gradient_checkpointing = False
        self.config = config
        self.num_patches = num_patches
        self.initialize_weights(num_patches)
        self.decoder_config = decoder_config
        self.set_trainable_cls_token()

    def set_trainable_cls_token(self, tensor: Optional[torch.Tensor] = None):
        # register a trainable CLS token
        tensor = (
            torch.zeros(1, 1, self.decoder_config.hidden_size)
            if tensor is None
            else tensor
        )
        self.trainable_cls_token = nn.Parameter(tensor)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor) -> torch.Tensor:
        """
        Modified version of the ViTMAE interpolation function, applied at the decoder. It interpolates the
        pre-trained decoder position encodings so the model can be used on higher-resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """

        # -1 removes the class dimension since we later append it without interpolation
        embeddings_positions = embeddings.shape[1] - 1
        num_positions = self.decoder_pos_embed.shape[1] - 1

        # Separation of class token and patch tokens
        class_pos_embed = self.decoder_pos_embed[:, 0, :]
        patch_pos_embed = self.decoder_pos_embed[:, 1:, :]

        # To retain the final 3d tensor with the required dimensions
        dim = self.decoder_pos_embed.shape[-1]

        # Increasing a dimension to enable bicubic interpolation
        patch_pos_embed = patch_pos_embed.reshape(1, 1, -1, dim)

        # permute to bring the dimension to be interpolated to the last position
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        # Interpolate the decoder position embeddings to the number of positions in `embeddings`;
        # the scale factor of 1 keeps the other dimension constant
        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            scale_factor=(1, embeddings_positions / num_positions),
            mode="bicubic",
            align_corners=False,
        )

        # Converting back to the original shape
        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
        # Adding the class token back
        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)

    def interpolate_latent(self, x: torch.Tensor) -> torch.Tensor:
        b, l, c = x.shape
        if l == self.num_patches:
            return x
        # interpolate the latent grid to the decoder's expected number of patches
        # print(f"interpolating latent from {l} to {self.num_patches}, x.shape = {x.shape}")
        h, w = int(l**0.5), int(l**0.5)
        x = x.reshape(b, h, w, c)
        x = x.permute(0, 3, 1, 2)
        target_size = (int(self.num_patches**0.5), int(self.num_patches**0.5))
        x = nn.functional.interpolate(
            x, size=target_size, mode="bilinear", align_corners=False
        )
        x = x.permute(0, 2, 3, 1).contiguous().view(b, self.num_patches, c)
        return x

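The two helpers above do related but distinct jobs: `interpolate_pos_encoding` resizes the frozen sin-cos *position* table to match a longer token sequence, while `interpolate_latent` resizes the incoming *feature* grid to the decoder's expected patch count. A minimal, self-contained sketch of the latter, with toy shapes chosen for illustration (196 = 14x14 tokens resized to 256 = 16x16; none of these numbers come from the shipped config):

```python
import torch
import torch.nn.functional as F

# Hypothetical shapes: 196 incoming patch tokens resized to 256.
b, l, c, num_patches = 2, 196, 1152, 256
x = torch.randn(b, l, c)

h = w = int(l**0.5)                    # 14
target = int(num_patches**0.5)         # 16
grid = x.reshape(b, h, w, c).permute(0, 3, 1, 2)   # (b, c, 14, 14)
grid = F.interpolate(grid, size=(target, target), mode="bilinear", align_corners=False)
x_resized = grid.permute(0, 2, 3, 1).reshape(b, target * target, c)
print(x_resized.shape)                 # torch.Size([2, 256, 1152])
```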
    def initialize_weights(self, num_patches):
        # initialize (and freeze) position embeddings by sin-cos embedding
        decoder_pos_embed = get_2d_sincos_pos_embed(
            self.decoder_pos_embed.shape[-1], int(num_patches**0.5), add_cls_token=True
        )
        self.decoder_pos_embed.data.copy_(
            torch.from_numpy(decoder_pos_embed).float().unsqueeze(0)
        )

        # timm's trunc_normal_(std=.02) is effectively normal_(std=0.02) as cutoff is too big (2.)
        # torch.nn.init.normal_(self.mask_token, std=self.config.initializer_range)

    def unpatchify(
        self,
        patchified_pixel_values,
        original_image_size: Optional[Tuple[int, int]] = None,
    ):
        """
        Args:
            patchified_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_patches, patch_size**2 * num_channels)`):
                Patchified pixel values.
            original_image_size (`Tuple[int, int]`, *optional*):
                Original image size.

        Returns:
            `torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`:
                Pixel values.
        """
        patch_size, num_channels = self.config.patch_size, self.config.num_channels
        original_image_size = (
            original_image_size
            if original_image_size is not None
            else (self.config.image_size, self.config.image_size)
        )
        original_height, original_width = original_image_size
        num_patches_h = original_height // patch_size
        num_patches_w = original_width // patch_size
        # sanity check
        if num_patches_h * num_patches_w != patchified_pixel_values.shape[1]:
            raise ValueError(
                f"The number of patches in the patchified pixel values {patchified_pixel_values.shape[1]} does not match the number of patches on the original image {num_patches_h}*{num_patches_w}"
            )

        # unpatchify
        batch_size = patchified_pixel_values.shape[0]
        patchified_pixel_values = patchified_pixel_values.reshape(
            batch_size,
            num_patches_h,
            num_patches_w,
            patch_size,
            patch_size,
            num_channels,
        )
        patchified_pixel_values = torch.einsum(
            "nhwpqc->nchpwq", patchified_pixel_values
        )
        pixel_values = patchified_pixel_values.reshape(
            batch_size,
            num_channels,
            num_patches_h * patch_size,
            num_patches_w * patch_size,
        )
        return pixel_values

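`unpatchify` is a pure reshape/transpose round trip; the einsum only moves axes. A hedged sketch with toy sizes (a 2x2 grid of 4x4 RGB patches, values arbitrary) showing that it reassembles a `(B, C, H, W)` image:

```python
import torch

batch, ph, pw, p, c = 1, 2, 2, 4, 3              # 2x2 grid of 4x4 patches, RGB
patches = torch.randn(batch, ph * pw, p * p * c)

x = patches.reshape(batch, ph, pw, p, p, c)
x = torch.einsum("nhwpqc->nchpwq", x)            # channels first, interleave patch rows/cols
image = x.reshape(batch, c, ph * p, pw * p)
print(image.shape)                               # torch.Size([1, 3, 8, 8])
```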
    def forward(
        self,
        hidden_states,
        output_attentions=False,
        output_hidden_states=False,
        return_dict=True,
        interpolate_pos_encoding: bool = False,
        drop_cls_token: bool = False,
    ):
        # embed tokens
        x = self.decoder_embed(hidden_states)
        # print(f"x.shape = {x.shape}")
        # append mask tokens to sequence
        # mask_tokens = self.mask_token.repeat(x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)
        # x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1)  # no cls token
        x_ = x[:, 1:, :]  # no cls token
        if drop_cls_token:
            cls_token = self.trainable_cls_token.expand(x_.shape[0], -1, -1)
            # print(f"cls_token.shape = {cls_token.shape}, x_.shape = {x_.shape}")
            x_ = self.interpolate_latent(x_)
            x = torch.cat([cls_token, x_], dim=1)
        else:
            raise NotImplementedError("drop_cls_token is not implemented")
            x = self.interpolate_latent(x)  # interpolate the whole latent (unreachable)
            x = torch.cat([x[:, :1, :], x_], dim=1)  # append cls token (unreachable)
        # add pos embed
        if interpolate_pos_encoding:
            decoder_pos_embed = self.interpolate_pos_encoding(x)
        else:
            decoder_pos_embed = self.decoder_pos_embed
        hidden_states = x + decoder_pos_embed

        # apply Transformer layers (blocks)
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        for i, layer_module in enumerate(self.decoder_layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    None,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(
                    hidden_states, head_mask=None, output_attentions=output_attentions
                )

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        hidden_states = self.decoder_norm(hidden_states)

        # predictor projection
        logits = self.decoder_pred(hidden_states)

        # remove cls token
        logits = logits[:, 1:, :]

        if not return_dict:
            return tuple(
                v
                for v in [logits, all_hidden_states, all_self_attentions]
                if v is not None
            )
        return ViTMAEDecoderOutput(
            logits=logits,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
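Taken together, `GeneralDecoder` maps a sequence of encoder features to patchified pixel logits, and `unpatchify` folds those back into an image. A hedged sketch of driving it directly; the config values below are illustrative assumptions (in actual use the shipped `F2PDecoderConfig` in config.json provides them), and `ViTMAEConfig` is used only because it carries the same decoder fields:

```python
import torch
from transformers import ViTMAEConfig

from decoder import GeneralDecoder  # the class defined above

# Hypothetical config: adjust hidden_size / num_patches to match the real checkpoint.
config = ViTMAEConfig(
    hidden_size=1152,
    decoder_hidden_size=512,
    decoder_num_hidden_layers=8,
    decoder_num_attention_heads=16,
    decoder_intermediate_size=2048,
    image_size=256,
    patch_size=16,
)
decoder = GeneralDecoder(config, num_patches=256).eval()

features = torch.randn(1, 257, 1152)               # CLS + 16x16 patch tokens
with torch.no_grad():
    out = decoder(features, drop_cls_token=True)   # the only implemented branch
pixels = decoder.unpatchify(out.logits)            # (1, 3, 256, 256)
```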
model.safetensors
ADDED

@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0760e38a1d417bef3dc9916d9add8f50c8a0e3ce06b56c84d8319fabfdc466cc
size 1662407016
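The entry above is a Git LFS pointer, not the weights themselves; the roughly 1.66 GB safetensors file is downloaded on demand. A hedged sketch of fetching and inspecting it directly (the repo id is a placeholder, as in the model card example):

```python
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

# "your-namespace/f2p_decoder" is a placeholder repo id.
path = hf_hub_download("your-namespace/f2p_decoder", "model.safetensors")
state_dict = load_file(path)
print(len(state_dict), "tensors")
print(sum(t.numel() for t in state_dict.values()), "parameters")
```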
modeling_f2p_decoder.py
ADDED

@@ -0,0 +1,93 @@
from dataclasses import dataclass
from typing import Optional, Tuple

import torch
from torch import nn
from transformers.modeling_outputs import ModelOutput
from transformers.modeling_utils import PreTrainedModel

try:
    from .configuration_f2p_decoder import F2PDecoderConfig
    from .decoder import GeneralDecoder
except ImportError:
    from configuration_f2p_decoder import F2PDecoderConfig
    from decoder import GeneralDecoder


@dataclass
class F2PDecoderOutput(ModelOutput):
    reconstruction: torch.FloatTensor = None
    logits: torch.FloatTensor = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


class F2PDecoderModel(PreTrainedModel):
    """Feature-to-pixel decoder for SigLIP2 patch features."""

    config_class = F2PDecoderConfig
    base_model_prefix = "f2p_decoder"
    main_input_name = "hidden_states"
    supports_gradient_checkpointing = True

    def __init__(self, config: F2PDecoderConfig):
        super().__init__(config)
        image_mean = torch.tensor(config.image_mean, dtype=torch.float32).view(
            1, config.num_channels, 1, 1
        )
        image_std = torch.tensor(config.image_std, dtype=torch.float32).view(
            1, config.num_channels, 1, 1
        )
        self.register_buffer("image_mean", image_mean)
        self.register_buffer("image_std", image_std)
        self.decoder = GeneralDecoder(config, num_patches=config.num_patches)

    def _set_gradient_checkpointing(self, module, value=False):
        if isinstance(module, GeneralDecoder):
            module.gradient_checkpointing = value

    def forward(
        self,
        hidden_states: Optional[torch.Tensor] = None,
        zs: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        if hidden_states is None:
            hidden_states = zs
        if hidden_states is None:
            raise ValueError("Pass SigLIP2 features as hidden_states or zs.")

        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        decoder_output = self.decoder(
            hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
            drop_cls_token=self.config.drop_cls_token,
        )
        reconstruction = self.decoder.unpatchify(decoder_output.logits)
        reconstruction = reconstruction * self.image_std + self.image_mean

        if return_dict:
            return F2PDecoderOutput(
                reconstruction=reconstruction,
                logits=decoder_output.logits,
                hidden_states=decoder_output.hidden_states,
                attentions=decoder_output.attentions,
            )
        return reconstruction

    @torch.no_grad()
    def infer(self, hidden_states: torch.Tensor) -> torch.Tensor:
        return self.forward(hidden_states, return_dict=False)
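Note that `forward` leaves `return_dict` as `None` by default, which is falsy, so calling the model without that argument returns the raw reconstruction tensor; pass `return_dict=True` explicitly to get the `F2PDecoderOutput` dataclass. A hedged post-processing sketch (here `model` is an `F2PDecoderModel` and `features` a `(1, 257, 1152)` SigLIP2 feature tensor obtained elsewhere; clamping to `[0, 1]` before saving is an assumption, since the decoder's output range is not bounded by construction):

```python
import torch
from PIL import Image

# `model` and `features` are assumed to exist; see the assumptions above.
with torch.no_grad():
    out = model(features, return_dict=True)        # F2PDecoderOutput

image = out.reconstruction[0]                      # (3, H, W), already de-normalized
image = image.clamp(0, 1).mul(255).byte()          # assumed display range [0, 1]
Image.fromarray(image.permute(1, 2, 0).cpu().numpy()).save("reconstruction.png")
```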