mgvz committed on
Commit
5cc6343
1 Parent(s): 25b9ecc

Upload model

Browse files
Files changed (4) hide show
  1. config.json +53 -0
  2. configuration_dinov2.py +188 -0
  3. model.safetensors +3 -0
  4. modeling_dinov2.py +997 -0
config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/ubuntu/hibou-local-weight",
3
+ "apply_layernorm": true,
4
+ "architectures": [
5
+ "Dinov2ModelWithRegisters"
6
+ ],
7
+ "attention_probs_dropout_prob": 0.0,
8
+ "auto_map": {
9
+ "AutoConfig": "configuration_dinov2.Dinov2Config",
10
+ "AutoModel": "modeling_dinov2.Dinov2ModelWithRegisters"
11
+ },
12
+ "drop_path_rate": 0.0,
13
+ "hidden_act": "gelu",
14
+ "hidden_dropout_prob": 0.0,
15
+ "hidden_size": 768,
16
+ "image_size": 224,
17
+ "initializer_range": 0.02,
18
+ "layer_norm_eps": 1e-06,
19
+ "layerscale_value": 1.0,
20
+ "mlp_ratio": 4,
21
+ "model_type": "dinov2",
22
+ "num_attention_heads": 12,
23
+ "num_channels": 3,
24
+ "num_hidden_layers": 12,
25
+ "num_register_tokens": 4,
26
+ "out_features": [
27
+ "stage12"
28
+ ],
29
+ "out_indices": [
30
+ 12
31
+ ],
32
+ "patch_size": 14,
33
+ "qkv_bias": true,
34
+ "reshape_hidden_states": true,
35
+ "stage_names": [
36
+ "stem",
37
+ "stage1",
38
+ "stage2",
39
+ "stage3",
40
+ "stage4",
41
+ "stage5",
42
+ "stage6",
43
+ "stage7",
44
+ "stage8",
45
+ "stage9",
46
+ "stage10",
47
+ "stage11",
48
+ "stage12"
49
+ ],
50
+ "torch_dtype": "float32",
51
+ "transformers_version": "4.41.2",
52
+ "use_swiglu_ffn": true
53
+ }
configuration_dinov2.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ DINOv2 model configuration"""
16
+
17
+ from collections import OrderedDict
18
+ from typing import Mapping
19
+
20
+ from packaging import version
21
+
22
+ from transformers.configuration_utils import PretrainedConfig
23
+ from transformers.onnx import OnnxConfig
24
+ from transformers.utils import logging
25
+ from transformers.utils.backbone_utils import (
26
+ BackboneConfigMixin,
27
+ get_aligned_output_features_output_indices,
28
+ )
29
+
30
+
31
+ logger = logging.get_logger(__name__)
32
+
33
+
34
class Dinov2Config(BackboneConfigMixin, PretrainedConfig):
    r"""
    Configuration holder for DINOv2 models, including the variant with register tokens
    (`Dinov2ModelWithRegisters`).

    The stored values fully describe the architecture to instantiate. The class derives from
    [`PretrainedConfig`]; see its documentation for the generic options that are forwarded through `**kwargs`.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Width of the token embeddings and of every encoder layer.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of Transformer encoder layers. Also determines the generated `stage_names`
            (`"stem"`, `"stage1"`, ..., `"stage{num_hidden_layers}"`).
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads in each attention layer.
        mlp_ratio (`int`, *optional*, defaults to 4):
            Multiplier applied to `hidden_size` to obtain the hidden width of the feed-forward network.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            Non-linearity of the feed-forward network, given either as a callable or as a string such as
            `"gelu"`, `"relu"`, `"selu"` or `"gelu_new"`.
        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
            Dropout probability for the embeddings and the fully connected layers.
        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
            Dropout probability applied to the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            Standard deviation of the truncated normal initializer used for weight matrices.
        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
            Epsilon of the layer normalization layers.
        image_size (`int`, *optional*, defaults to 224):
            Size (resolution) of the input images.
        patch_size (`int`, *optional*, defaults to 16):
            Size (resolution) of each image patch.
        num_channels (`int`, *optional*, defaults to 3):
            Number of input channels.
        qkv_bias (`bool`, *optional*, defaults to `True`):
            Whether the query, key and value projections carry a bias term.
        layerscale_value (`float`, *optional*, defaults to 1.0):
            Initial value of the layer scale parameters.
        drop_path_rate (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate per sample (applied in the main path of the residual layers).
        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
            Whether the feed-forward network is the SwiGLU variant instead of the plain MLP.
        out_features (`List[str]`, *optional*):
            When used as a backbone, names of the stages to output (`"stem"`, `"stage1"`, `"stage2"`, ...).
            If unset and `out_indices` is set, defaults to the corresponding stages. If both are unset, defaults
            to the last stage. Must follow the order of `stage_names`.
        out_indices (`List[int]`, *optional*):
            When used as a backbone, indices of the stages to output (0, 1, 2, ...). If unset and `out_features`
            is set, defaults to the corresponding stages. If both are unset, defaults to the last stage. Must
            follow the order of `stage_names`.
        apply_layernorm (`bool`, *optional*, defaults to `True`):
            Whether layer normalization is applied to the feature maps when the model is used as a backbone.
        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
            Whether backbone feature maps are reshaped to 4D tensors `(batch_size, hidden_size, height, width)`.
            If `False`, they stay 3D tensors `(batch_size, seq_len, hidden_size)`.
        num_register_tokens (`int`, *optional*, defaults to 0):
            Number of learnable register tokens inserted right after the CLS token. No register tokens are
            created unless the value is greater than 0.

    Example:

    ```python
    >>> from transformers import Dinov2Config, Dinov2Model

    >>> # Build a configuration with the default values
    >>> configuration = Dinov2Config()

    >>> # Build a model (with random weights) from that configuration
    >>> model = Dinov2Model(configuration)

    >>> # Read the configuration back from the model
    >>> configuration = model.config
    ```"""

    model_type = "dinov2"

    def __init__(
        self,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        mlp_ratio=4,
        hidden_act="gelu",
        hidden_dropout_prob=0.0,
        attention_probs_dropout_prob=0.0,
        initializer_range=0.02,
        layer_norm_eps=1e-6,
        image_size=224,
        patch_size=16,
        num_channels=3,
        qkv_bias=True,
        layerscale_value=1.0,
        drop_path_rate=0.0,
        use_swiglu_ffn=False,
        out_features=None,
        out_indices=None,
        apply_layernorm=True,
        reshape_hidden_states=True,
        num_register_tokens=0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Transformer geometry
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.mlp_ratio = mlp_ratio
        self.hidden_act = hidden_act

        # Regularisation and initialisation
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps

        # Input description
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels

        # Block options
        self.qkv_bias = qkv_bias
        self.layerscale_value = layerscale_value
        self.drop_path_rate = drop_path_rate
        self.use_swiglu_ffn = use_swiglu_ffn

        # Backbone stages: the stem followed by one stage per encoder layer
        self.stage_names = [
            "stem",
            *(f"stage{layer}" for layer in range(1, num_hidden_layers + 1)),
        ]
        aligned = get_aligned_output_features_output_indices(
            out_features=out_features,
            out_indices=out_indices,
            stage_names=self.stage_names,
        )
        self._out_features, self._out_indices = aligned
        self.apply_layernorm = apply_layernorm
        self.reshape_hidden_states = reshape_hidden_states

        # Register tokens
        self.num_register_tokens = num_register_tokens
170
+
171
+
172
class Dinov2OnnxConfig(OnnxConfig):
    """ONNX export description for DINOv2: input axes, minimum torch version and validation tolerance."""

    torch_onnx_minimum_version = version.parse("1.11")

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return the single model input, `pixel_values`, with a name for each of its four axes."""
        pixel_axes = {0: "batch", 1: "num_channels", 2: "height", 3: "width"}
        return OrderedDict(pixel_values=pixel_axes)

    @property
    def atol_for_validation(self) -> float:
        """Return the absolute tolerance used when validating the exported model."""
        tolerance = 1e-4
        return tolerance
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2121db3cac83bc9abf13a458a37b0740e2ce725609ff8d4b8e6b6c56c30c3ab6
3
+ size 342988656
modeling_dinov2.py ADDED
@@ -0,0 +1,997 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 Meta AI and The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch DINOv2 model."""
16
+
17
+
18
+ import collections.abc
19
+ import math
20
+ from typing import Dict, List, Optional, Set, Tuple, Union
21
+
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
26
+
27
+ from transformers.activations import ACT2FN
28
+ from transformers.modeling_outputs import (
29
+ BackboneOutput,
30
+ BaseModelOutput,
31
+ BaseModelOutputWithPooling,
32
+ ImageClassifierOutput,
33
+ )
34
+ from transformers.modeling_utils import PreTrainedModel
35
+ from transformers.pytorch_utils import (
36
+ find_pruneable_heads_and_indices,
37
+ prune_linear_layer,
38
+ )
39
+ from transformers.utils import (
40
+ add_code_sample_docstrings,
41
+ add_start_docstrings,
42
+ add_start_docstrings_to_model_forward,
43
+ logging,
44
+ replace_return_docstrings,
45
+ )
46
+ from transformers.utils.backbone_utils import BackboneMixin
47
+ from .configuration_dinov2 import Dinov2Config
48
+
49
+
50
+ logger = logging.get_logger(__name__)
51
+
52
+ # General docstring
53
+ _CONFIG_FOR_DOC = "Dinov2Config"
54
+
55
+ # Base docstring
56
+ _CHECKPOINT_FOR_DOC = "facebook/dinov2-base"
57
+ _EXPECTED_OUTPUT_SHAPE = [1, 257, 768]
58
+
59
+ # Image classification docstring
60
+ _IMAGE_CLASS_CHECKPOINT = "facebook/dinov2-small-imagenet1k-1-layer"
61
+ _IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
62
+
63
+
64
class Dinov2EmbeddingsWithRegisters(nn.Module):
    """
    Build the token sequence consumed by the encoder: patch embeddings with a prepended CLS token and added
    position embeddings, plus optional register tokens inserted directly after the CLS token. A mask token can
    replace selected patch embeddings.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()

        width = config.hidden_size
        self.cls_token = nn.Parameter(torch.randn(1, 1, width))
        self.mask_token = nn.Parameter(torch.zeros(1, width))
        self.patch_embeddings = Dinov2PatchEmbeddings(config)
        # one position per patch plus one for the CLS token
        position_count = self.patch_embeddings.num_patches + 1
        self.position_embeddings = nn.Parameter(torch.randn(1, position_count, width))
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.config = config
        # register tokens exist only when the configuration asks for at least one
        self.register_tokens = (
            nn.Parameter(torch.randn(1, config.num_register_tokens, width))
            if config.num_register_tokens > 0
            else None
        )

    def interpolate_pos_encoding(
        self, embeddings: torch.Tensor, height: int, width: int
    ) -> torch.Tensor:
        """
        Resize the learned position embeddings so the model can run on images whose patch grid differs from the
        one the embeddings were created for.

        `embeddings` is the token sequence including the CLS token; `height` and `width` are the image size in
        pixels. The stored embeddings are returned untouched when the patch count matches and the image is
        square. Raises `ValueError` if the interpolated grid does not have the expected size.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        """
        patch_count = embeddings.shape[1] - 1
        stored_count = self.position_embeddings.shape[1] - 1
        if patch_count == stored_count and height == width:
            return self.position_embeddings

        cls_position = self.position_embeddings[:, 0]
        grid_positions = self.position_embeddings[:, 1:]
        dim = embeddings.shape[-1]
        patch = self.config.patch_size
        # a small offset is added to avoid floating point error in the interpolation
        # see discussion at https://github.com/facebookresearch/dino/issues/8
        target_rows = height // patch + 0.1
        target_cols = width // patch + 0.1

        side = math.sqrt(stored_count)
        grid_positions = grid_positions.reshape(1, int(side), int(side), dim)
        grid_positions = grid_positions.permute(0, 3, 1, 2)
        stored_dtype = grid_positions.dtype
        # interpolate in float32, then cast back to the parameter dtype
        resized = nn.functional.interpolate(
            grid_positions.to(dtype=torch.float32),
            scale_factor=(float(target_rows / side), float(target_cols / side)),
            mode="bicubic",
            align_corners=False,
        ).to(dtype=stored_dtype)

        rows_match = int(target_rows) == resized.shape[-2]
        cols_match = int(target_cols) == resized.shape[-1]
        if not (rows_match and cols_match):
            raise ValueError(
                "Width or height does not match with the interpolated position embeddings"
            )

        resized = resized.permute(0, 2, 3, 1).view(1, -1, dim)
        return torch.cat((cls_position.unsqueeze(0), resized), dim=1)

    def forward(
        self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None
    ) -> torch.Tensor:
        """
        Embed `pixel_values` into a token sequence.

        When `bool_masked_pos` is given, patch embeddings at the selected positions are replaced by the mask
        token before the CLS token and the position embeddings are added.
        """
        batch_size, _, height, width = pixel_values.shape
        projection_dtype = self.patch_embeddings.projection.weight.dtype
        tokens = self.patch_embeddings(pixel_values.to(dtype=projection_dtype))

        if bool_masked_pos is not None:
            tokens = torch.where(
                bool_masked_pos.unsqueeze(-1),
                self.mask_token.to(tokens.dtype).unsqueeze(0),
                tokens,
            )

        # prepend the [CLS] token to the embedded patch tokens
        tokens = torch.cat((self.cls_token.expand(batch_size, -1, -1), tokens), dim=1)

        # add positional encoding to each token
        tokens = tokens + self.interpolate_pos_encoding(tokens, height, width)

        # insert the register tokens between the [CLS] token and the patch tokens
        if self.register_tokens is not None:
            registers = self.register_tokens.expand(tokens.shape[0], -1, -1)
            tokens = torch.cat((tokens[:, :1], registers, tokens[:, 1:]), dim=1)

        return self.dropout(tokens)
173
+
174
+
175
class Dinov2PatchEmbeddings(nn.Module):
    """
    Convert `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial patch
    embeddings of shape `(batch_size, seq_length, hidden_size)` that a Transformer consumes.
    """

    def __init__(self, config):
        super().__init__()

        def as_pair(value):
            # a scalar size is used for both dimensions; iterables are kept as given
            if isinstance(value, collections.abc.Iterable):
                return value
            return (value, value)

        image_size = as_pair(config.image_size)
        patch_size = as_pair(config.patch_size)
        patches_along_dim1 = image_size[1] // patch_size[1]
        patches_along_dim0 = image_size[0] // patch_size[0]

        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = config.num_channels
        self.num_patches = patches_along_dim1 * patches_along_dim0

        # a strided convolution whose kernel equals the patch size projects each patch to `hidden_size`
        self.projection = nn.Conv2d(
            config.num_channels,
            config.hidden_size,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        """Project the image into patch embeddings; raise `ValueError` on a channel-count mismatch."""
        num_channels = pixel_values.shape[1]
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
                f" Expected {self.num_channels} but got {num_channels}."
            )
        projected = self.projection(pixel_values)
        # flatten the spatial grid into a sequence and move the channel axis last
        return projected.flatten(2).transpose(1, 2)
218
+
219
+
220
+ # Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->Dinov2
221
class Dinov2SelfAttention(nn.Module):
    """
    Multi-head scaled dot-product self-attention with separate query, key and value projections.

    Raises:
        ValueError: if `config.hidden_size` is not divisible by `config.num_attention_heads` and the
            configuration has no `embedding_size` attribute.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(
            config, "embedding_size"
        ):
            # NOTE: the placeholder must not end with a comma, otherwise a one-element tuple is formatted
            raise ValueError(
                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
                f"heads {config.num_attention_heads}."
            )

        self.num_attention_heads = config.num_attention_heads
        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
        self.all_head_size = self.num_attention_heads * self.attention_head_size

        self.query = nn.Linear(
            config.hidden_size, self.all_head_size, bias=config.qkv_bias
        )
        self.key = nn.Linear(
            config.hidden_size, self.all_head_size, bias=config.qkv_bias
        )
        self.value = nn.Linear(
            config.hidden_size, self.all_head_size, bias=config.qkv_bias
        )

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
        """Split the last dimension into `(heads, head_size)` and move the head axis to position 1."""
        new_x_shape = x.size()[:-1] + (
            self.num_attention_heads,
            self.attention_head_size,
        )
        x = x.view(new_x_shape)
        return x.permute(0, 2, 1, 3)

    def forward(
        self,
        hidden_states,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        """
        Apply self-attention to `hidden_states`.

        Returns a tuple `(context,)`, or `(context, attention_probs)` when `output_attentions` is true.
        `head_mask`, when given, is multiplied into the attention probabilities.
        """
        mixed_query_layer = self.query(hidden_states)

        key_layer = self.transpose_for_scores(self.key(hidden_states))
        value_layer = self.transpose_for_scores(self.value(hidden_states))
        query_layer = self.transpose_for_scores(mixed_query_layer)

        # Take the dot product between "query" and "key" to get the raw attention scores.
        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))

        attention_scores = attention_scores / math.sqrt(self.attention_head_size)

        # Normalize the attention scores to probabilities.
        attention_probs = nn.functional.softmax(attention_scores, dim=-1)

        # This is actually dropping out entire tokens to attend to, which might
        # seem a bit unusual, but is taken from the original Transformer paper.
        attention_probs = self.dropout(attention_probs)

        # Mask heads if we want to
        if head_mask is not None:
            attention_probs = attention_probs * head_mask

        context_layer = torch.matmul(attention_probs, value_layer)

        # merge the heads back into a single hidden dimension
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)

        outputs = (
            (context_layer, attention_probs) if output_attentions else (context_layer,)
        )

        return outputs
295
+
296
+
297
+ # Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->Dinov2
298
class Dinov2SelfOutput(nn.Module):
    """
    Output projection of the attention block: a linear layer followed by dropout.

    No residual connection is added here; it is defined in Dinov2Layer instead (unlike other models), due to
    the layernorm applied before each block.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self, hidden_states: torch.Tensor, input_tensor: torch.Tensor
    ) -> torch.Tensor:
        """Project `hidden_states` and apply dropout; `input_tensor` is accepted but not used."""
        return self.dropout(self.dense(hidden_states))
316
+
317
+
318
+ # Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->Dinov2
319
class Dinov2Attention(nn.Module):
    """Self-attention followed by its output projection, with support for pruning attention heads."""

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.attention = Dinov2SelfAttention(config)
        self.output = Dinov2SelfOutput(config)
        self.pruned_heads = set()

    def prune_heads(self, heads: Set[int]) -> None:
        """Remove the given attention heads from the projections and record them in `pruned_heads`."""
        if not heads:
            return

        attn = self.attention
        heads, index = find_pruneable_heads_and_indices(
            heads,
            attn.num_attention_heads,
            attn.attention_head_size,
            self.pruned_heads,
        )

        # Prune linear layers
        attn.query = prune_linear_layer(attn.query, index)
        attn.key = prune_linear_layer(attn.key, index)
        attn.value = prune_linear_layer(attn.value, index)
        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)

        # Update hyper params and store pruned heads
        attn.num_attention_heads -= len(heads)
        attn.all_head_size = attn.attention_head_size * attn.num_attention_heads
        self.pruned_heads = self.pruned_heads.union(heads)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        """Run attention and the output projection; attention weights follow when they were requested."""
        self_outputs = self.attention(hidden_states, head_mask, output_attentions)
        projected = self.output(self_outputs[0], hidden_states)
        # everything after the first element (the attention weights, if any) is passed through
        return (projected, *self_outputs[1:])
365
+
366
+
367
class Dinov2LayerScale(nn.Module):
    """Learnable per-channel scaling, initialised to `config.layerscale_value` for every channel."""

    def __init__(self, config) -> None:
        super().__init__()
        initial_scale = torch.ones(config.hidden_size) * config.layerscale_value
        self.lambda1 = nn.Parameter(initial_scale)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        """Multiply `hidden_state` by the learned scale along its last dimension."""
        return torch.mul(hidden_state, self.lambda1)
376
+
377
+
378
+ # Copied from transformers.models.beit.modeling_beit.drop_path
379
def drop_path(
    input: torch.Tensor, drop_prob: float = 0.0, training: bool = False
) -> torch.Tensor:
    """
    Apply stochastic depth ("drop path") per sample, as used in the main path of residual blocks.

    Outside of training, or when `drop_prob` is 0.0, `input` is returned unchanged. Otherwise one random
    keep/drop decision is drawn per sample (first dimension); kept samples are divided by the keep probability
    and dropped samples are multiplied by zero.

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
    """
    if not training or drop_prob == 0.0:
        return input

    keep_prob = 1 - drop_prob
    # one entry per sample, broadcast over all remaining dimensions (not just 2D ConvNets)
    mask_shape = (input.shape[0], *([1] * (input.ndim - 1)))
    keep_mask = torch.rand(mask_shape, dtype=input.dtype, device=input.device)
    # adding keep_prob and flooring binarizes the uniform sample to 0 or 1
    keep_mask.add_(keep_prob).floor_()
    return input.div(keep_prob) * keep_mask
403
+
404
+
405
+ # Copied from transformers.models.beit.modeling_beit.BeitDropPath
406
class Dinov2DropPath(nn.Module):
    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Args:
        drop_prob: probability of dropping a sample's path during training. `None` (the default) disables
            dropping, so the module acts as an identity.
    """

    def __init__(self, drop_prob: Optional[float] = None) -> None:
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return `hidden_states` with stochastic depth applied according to the training flag."""
        # The default `None` cannot be used in arithmetic; without this guard, training mode fails on
        # `1 - None`. Treat it as "never drop".
        if self.drop_prob is None:
            return hidden_states
        return drop_path(hidden_states, self.drop_prob, self.training)

    def extra_repr(self) -> str:
        """Show the drop probability in the module's printed representation."""
        return f"p={self.drop_prob}"
418
+
419
+
420
class Dinov2MLP(nn.Module):
    """Two-layer feed-forward network: linear expansion, activation, linear projection back."""

    def __init__(self, config) -> None:
        super().__init__()
        width = config.hidden_size
        expanded = int(config.hidden_size * config.mlp_ratio)
        self.fc1 = nn.Linear(width, expanded, bias=True)
        # a string names an entry of ACT2FN; anything else is used directly as the activation
        act = config.hidden_act
        self.activation = ACT2FN[act] if isinstance(act, str) else act
        self.fc2 = nn.Linear(expanded, width, bias=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        """Apply `fc1`, the activation and `fc2` in sequence."""
        return self.fc2(self.activation(self.fc1(hidden_state)))
437
+
438
+
439
class Dinov2SwiGLUFFN(nn.Module):
    """SwiGLU feed-forward network: one fused input projection, a SiLU-gated product, one output projection."""

    def __init__(self, config) -> None:
        super().__init__()
        width = config.hidden_size
        expanded = int(config.hidden_size * config.mlp_ratio)
        # take two thirds of the expanded width, then round up to a multiple of 8
        gated_width = (int(expanded * 2 / 3) + 7) // 8 * 8

        # the input projection produces both halves (gate and value) at once
        self.weights_in = nn.Linear(width, 2 * gated_width, bias=True)
        self.weights_out = nn.Linear(gated_width, width, bias=True)

    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
        """Project, split into two halves, gate the second half with SiLU of the first, and project back."""
        gate, value = self.weights_in(hidden_state).chunk(2, dim=-1)
        return self.weights_out(nn.functional.silu(gate) * value)
454
+
455
+
456
class Dinov2Layer(nn.Module):
    """
    One encoder block (the Block class of the original implementation): pre-norm self-attention and pre-norm
    feed-forward network, each followed by layer scale, optional drop path and a residual connection.
    """

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()

        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.attention = Dinov2Attention(config)
        self.layer_scale1 = Dinov2LayerScale(config)
        # stochastic depth only when a positive rate is configured
        if config.drop_path_rate > 0.0:
            self.drop_path = Dinov2DropPath(config.drop_path_rate)
        else:
            self.drop_path = nn.Identity()

        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        feed_forward_cls = Dinov2SwiGLUFFN if config.use_swiglu_ffn else Dinov2MLP
        self.mlp = feed_forward_cls(config)
        self.layer_scale2 = Dinov2LayerScale(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]:
        """Return `(layer_output,)`, followed by the attention weights when they were requested."""
        # in Dinov2, layernorm is applied before self-attention
        attention_results = self.attention(
            self.norm1(hidden_states),
            head_mask,
            output_attentions=output_attentions,
        )
        scaled_attention = self.layer_scale1(attention_results[0])
        # attention weights, present only if we output them
        extras = attention_results[1:]

        # first residual connection
        hidden_states = self.drop_path(scaled_attention) + hidden_states

        # in Dinov2, layernorm is also applied after self-attention
        feed_forward = self.layer_scale2(self.mlp(self.norm2(hidden_states)))

        # second residual connection
        layer_output = self.drop_path(feed_forward) + hidden_states

        return (layer_output,) + extras
513
+
514
+
515
+ # Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->Dinov2
516
class Dinov2Encoder(nn.Module):
    """Stack of `config.num_hidden_layers` Dinov2Layer blocks applied one after another."""

    def __init__(self, config: Dinov2Config) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList(
            Dinov2Layer(config) for _ in range(config.num_hidden_layers)
        )
        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
        return_dict: bool = True,
    ) -> Union[tuple, BaseModelOutput]:
        """
        Run every layer in order.

        Hidden states (the input of each layer plus the final output) and attention weights are collected only
        when requested. With `return_dict=False` a tuple of the non-`None` results is returned instead of a
        `BaseModelOutput`.
        """
        state_trace = [] if output_hidden_states else None
        attention_trace = [] if output_attentions else None

        for index, block in enumerate(self.layer):
            if state_trace is not None:
                state_trace.append(hidden_states)

            block_head_mask = None if head_mask is None else head_mask[index]

            if self.gradient_checkpointing and self.training:
                block_outputs = self._gradient_checkpointing_func(
                    block.__call__,
                    hidden_states,
                    block_head_mask,
                    output_attentions,
                )
            else:
                block_outputs = block(hidden_states, block_head_mask, output_attentions)

            hidden_states = block_outputs[0]

            if attention_trace is not None:
                attention_trace.append(block_outputs[1])

        # the output of the last layer closes the hidden-state trace
        if state_trace is not None:
            state_trace.append(hidden_states)

        all_hidden_states = None if state_trace is None else tuple(state_trace)
        all_self_attentions = None if attention_trace is None else tuple(attention_trace)

        if not return_dict:
            candidates = [hidden_states, all_hidden_states, all_self_attentions]
            return tuple(item for item in candidates if item is not None)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )
573
+
574
+
575
class Dinov2PreTrainedModel(PreTrainedModel):
    """
    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
    models.
    """

    config_class = Dinov2Config
    base_model_prefix = "dinov2"
    main_input_name = "pixel_values"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Dinov2SwiGLUFFN"]

    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
        """Initialize the parameters of a single sub-module.

        Linear/Conv2d weights and the embedding parameters (position
        embeddings, CLS token, optional register tokens) are drawn from a
        truncated normal with std `config.initializer_range`; Linear/Conv2d
        biases are zeroed; LayerNorm is reset to weight 1 and bias 0.
        """

        def _sample_like(values: torch.Tensor) -> torch.Tensor:
            # Upcast to fp32 for sampling and cast back afterwards, because
            # `trunc_normal_` is not implemented for half precision on CPU.
            sampled = nn.init.trunc_normal_(
                values.to(torch.float32),
                mean=0.0,
                std=self.config.initializer_range,
            )
            return sampled.to(values.dtype)

        if isinstance(module, (nn.Linear, nn.Conv2d)):
            module.weight.data = _sample_like(module.weight.data)
            if module.bias is not None:
                module.bias.data.zero_()
            return

        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
            return

        if isinstance(module, Dinov2EmbeddingsWithRegisters):
            module.position_embeddings.data = _sample_like(
                module.position_embeddings.data
            )
            module.cls_token.data = _sample_like(module.cls_token.data)
            # Register tokens are optional on the embeddings module.
            if module.register_tokens is not None:
                module.register_tokens.data = _sample_like(
                    module.register_tokens.data
                )
620
+
621
+
622
# Class-level documentation fragment. It is passed to `add_start_docstrings`
# on the model classes in this file.
DINOV2_START_DOCSTRING = r"""
    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
    behavior.

    Parameters:
        config ([`Dinov2Config`]): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""

# Forward-argument documentation for the base model, whose `forward` accepts
# `bool_masked_pos` in addition to the common arguments.
DINOV2_BASE_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BitImageProcessor.preprocess`] for details.

        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
            pre-training.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""

# Forward-argument documentation for the classification and backbone models,
# whose `forward` does not take `bool_masked_pos`.
DINOV2_INPUTS_DOCSTRING = r"""
    Args:
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
            [`BitImageProcessor.preprocess`] for details.

        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        output_attentions (`bool`, *optional*):
            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
            tensors for more detail.
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
            more detail.
        return_dict (`bool`, *optional*):
            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
680
+
681
+
682
@add_start_docstrings(
    "The bare DINOv2 Model transformer outputting raw hidden-states without any specific head on top.",
    DINOV2_START_DOCSTRING,
)
class Dinov2ModelWithRegisters(Dinov2PreTrainedModel):
    # Embeddings -> encoder -> final layernorm. The pooled output is the
    # normalized token at sequence position 0.
    def __init__(self, config: Dinov2Config):
        super().__init__(config)
        self.config = config

        self.embeddings = Dinov2EmbeddingsWithRegisters(config)
        self.encoder = Dinov2Encoder(config)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None:
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer_index, head_ids in heads_to_prune.items():
            self.encoder.layer[layer_index].attention.prune_heads(head_ids)

    @add_start_docstrings_to_model_forward(DINOV2_BASE_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_CHECKPOINT_FOR_DOC,
        output_type=BaseModelOutputWithPooling,
        config_class=_CONFIG_FOR_DOC,
        modality="vision",
        expected_output=_EXPECTED_OUTPUT_SHAPE,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        bool_masked_pos: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        # Per-call flags override the config; `None` means "use the config value".
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        # Prepare head mask if needed
        # 1.0 in head_mask indicate we keep the head
        # attention_probs has shape bsz x n_heads x N x N
        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)

        token_embeddings = self.embeddings(
            pixel_values, bool_masked_pos=bool_masked_pos
        )
        encoder_outputs = self.encoder(
            token_embeddings,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        normed_sequence = self.layernorm(encoder_outputs[0])
        # The token at position 0 serves as the pooled representation.
        pooled_output = normed_sequence[:, 0, :]

        if return_dict:
            return BaseModelOutputWithPooling(
                last_hidden_state=normed_sequence,
                pooler_output=pooled_output,
                hidden_states=encoder_outputs.hidden_states,
                attentions=encoder_outputs.attentions,
            )

        return (normed_sequence, pooled_output) + encoder_outputs[1:]
776
+
777
+
778
@add_start_docstrings(
    """
    Dinov2 Model transformer with an image classification head on top (a linear layer on top of the final hidden state
    of the [CLS] token) e.g. for ImageNet.
    """,
    DINOV2_START_DOCSTRING,
)
class Dinov2ForImageClassification(Dinov2PreTrainedModel):
    # The classifier consumes the CLS embedding concatenated with the mean of
    # the patch embeddings, hence its `2 * hidden_size` input width.
    def __init__(self, config: Dinov2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels
        self.dinov2 = Dinov2ModelWithRegisters(config)

        # Classifier head
        self.classifier = (
            nn.Linear(config.hidden_size * 2, config.num_labels)
            if config.num_labels > 0
            else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
    @add_code_sample_docstrings(
        checkpoint=_IMAGE_CLASS_CHECKPOINT,
        output_type=ImageClassifierOutput,
        config_class=_CONFIG_FOR_DOC,
        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
    )
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[tuple, ImageClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        """
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )

        outputs = self.dinov2(
            pixel_values,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]  # batch_size, sequence_length, hidden_size

        cls_token = sequence_output[:, 0]
        # Register tokens are not image patches, so they must not contribute
        # to the mean patch embedding; skip them together with the CLS token.
        # NOTE(review): assumes the embeddings module lays the sequence out as
        # [CLS, registers..., patches...] -- confirm against
        # Dinov2EmbeddingsWithRegisters.
        num_register_tokens = getattr(self.config, "num_register_tokens", 0) or 0
        patch_tokens = sequence_output[:, 1 + num_register_tokens :]

        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)

        logits = self.classifier(linear_input)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            # Infer the problem type once from the label count/dtype and cache
            # it on the config for subsequent calls.
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (
                    labels.dtype == torch.long or labels.dtype == torch.int
                ):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        if not return_dict:
            # Drop the sequence and pooled outputs of the base model; keep the
            # optional hidden states / attentions that follow them.
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
882
+
883
+
884
@add_start_docstrings(
    """
    Dinov2 backbone, to be used with frameworks like DETR and MaskFormer.
    """,
    DINOV2_START_DOCSTRING,
)
class Dinov2Backbone(Dinov2PreTrainedModel, BackboneMixin):
    # Exposes selected encoder stages as feature maps. When
    # `config.reshape_hidden_states` is set, each map is reshaped from a token
    # sequence to (batch, hidden, height // patch, width // patch).
    def __init__(self, config):
        super().__init__(config)
        super()._init_backbone(config)

        # One entry per stage: the embeddings ("stem") plus every encoder layer.
        self.num_features = [
            config.hidden_size for _ in range(config.num_hidden_layers + 1)
        ]
        self.embeddings = Dinov2EmbeddingsWithRegisters(config)
        self.encoder = Dinov2Encoder(config)

        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> Dinov2PatchEmbeddings:
        return self.embeddings.patch_embeddings

    @add_start_docstrings_to_model_forward(DINOV2_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        pixel_values: torch.Tensor,
        output_hidden_states: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> BackboneOutput:
        """
        Returns:

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-base")
        >>> model = AutoBackbone.from_pretrained(
        ...     "facebook/dinov2-base", out_features=["stage2", "stage5", "stage8", "stage11"]
        ... )

        >>> inputs = processor(image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> feature_maps = outputs.feature_maps
        >>> list(feature_maps[-1].shape)
        [1, 768, 16, 16]
        ```"""
        return_dict = (
            return_dict if return_dict is not None else self.config.use_return_dict
        )
        output_hidden_states = (
            output_hidden_states
            if output_hidden_states is not None
            else self.config.output_hidden_states
        )
        output_attentions = (
            output_attentions
            if output_attentions is not None
            else self.config.output_attentions
        )

        embedding_output = self.embeddings(pixel_values)

        # Hidden states are always requested from the encoder because the
        # feature maps are built from them; the caller's flag only controls
        # whether they are also returned.
        outputs = self.encoder(
            embedding_output,
            output_hidden_states=True,
            output_attentions=output_attentions,
            return_dict=return_dict,
        )

        hidden_states = outputs.hidden_states if return_dict else outputs[1]

        # Number of leading non-patch tokens to drop before the spatial
        # reshape: the CLS token plus any register tokens. Keeping the
        # registers would make the token count differ from
        # (height // patch) * (width // patch), and the `-1` in the reshape
        # would silently absorb the surplus into a wrong channel dimension.
        # NOTE(review): assumes the sequence layout is
        # [CLS, registers..., patches...] -- confirm against
        # Dinov2EmbeddingsWithRegisters.
        num_register_tokens = getattr(self.config, "num_register_tokens", 0) or 0
        num_prefix_tokens = 1 + num_register_tokens

        feature_maps = ()
        for stage, hidden_state in zip(self.stage_names, hidden_states):
            if stage in self.out_features:
                if self.config.apply_layernorm:
                    hidden_state = self.layernorm(hidden_state)
                if self.config.reshape_hidden_states:
                    hidden_state = hidden_state[:, num_prefix_tokens:]
                    # this was actually a bug in the original implementation that we copied here,
                    # cause normally the order is height, width
                    batch_size, _, height, width = pixel_values.shape
                    patch_size = self.config.patch_size
                    hidden_state = hidden_state.reshape(
                        batch_size, height // patch_size, width // patch_size, -1
                    )
                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
                feature_maps += (hidden_state,)

        if not return_dict:
            # outputs[0] is the last hidden state and outputs[1] the tuple of
            # all hidden states; include the latter only when requested.
            if output_hidden_states:
                output = (feature_maps,) + outputs[1:]
            else:
                output = (feature_maps,) + outputs[2:]
            return output

        return BackboneOutput(
            feature_maps=feature_maps,
            hidden_states=outputs.hidden_states if output_hidden_states else None,
            attentions=outputs.attentions if output_attentions else None,
        )