VictorSanh committed on
Commit ffd5378
1 Parent(s): ebb4689

tiny random siglip

config.json ADDED
@@ -0,0 +1,35 @@
{
  "_name_or_path": "HuggingFaceM4/tiny-random-siglip",
  "architectures": [
    "SiglipModel"
  ],
  "auto_map": {
    "AutoConfig": "HuggingFaceM4/tiny-random-siglip--configuration_siglip.SiglipConfig",
    "AutoModel": "HuggingFaceM4/tiny-random-siglip--modeling_siglip.SiglipModel"
  },
  "initializer_factor": 1.0,
  "logit_scale_init_value": 2.6592,
  "model_type": "siglip",
  "projection_dim": 512,
  "text_config": {
    "hidden_size": 144,
    "intermediate_size": 538,
    "model_type": "siglip_text_model",
    "num_attention_heads": 2,
    "num_hidden_layers": 3,
    "projection_dim": 64,
    "vocab_size": 32000
  },
  "torch_dtype": "float32",
  "transformers_version": "4.35.2",
  "vision_config": {
    "hidden_size": 144,
    "image_size": 30,
    "intermediate_size": 538,
    "model_type": "siglip_vision_model",
    "num_attention_heads": 2,
    "num_hidden_layers": 3,
    "patch_size": 2,
    "projection_dim": 64
  }
}
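The `auto_map` entries point at the custom `configuration_siglip.py` and `modeling_siglip.py` files added below, so loading this checkpoint goes through `trust_remote_code=True`. A minimal loading sketch, assuming the repo id above is reachable on the Hub (this snippet is not part of the commit itself):

```python
# Hypothetical usage sketch for the tiny-random checkpoint defined by this config.
from transformers import AutoConfig, AutoModel

repo = "HuggingFaceM4/tiny-random-siglip"
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)  # resolves to SiglipConfig via auto_map
model = AutoModel.from_pretrained(repo, trust_remote_code=True)    # resolves to SiglipModel via auto_map

print(type(model).__name__)            # SiglipModel
print(config.text_config.hidden_size)  # 144
print(config.vision_config.patch_size) # 2
```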
configuration_siglip.py ADDED
@@ -0,0 +1,444 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Siglip model configuration"""

import os
from collections import OrderedDict
from typing import TYPE_CHECKING, Any, Mapping, Optional, Union


if TYPE_CHECKING:
    from transformers.processing_utils import ProcessorMixin
    from transformers.utils import TensorType

from transformers.configuration_utils import PretrainedConfig
from transformers.onnx import OnnxConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)

SIGLIP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "google/siglip-base-patch16-224": "https://huggingface.co/google/siglip-base-patch16-224/resolve/main/config.json",
}


class SiglipTextConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SiglipTextModel`]. It is used to instantiate a
    Siglip text encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the text encoder of the Siglip
    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the Siglip text model. Defines the number of different tokens that can be represented
            by the `inputs_ids` passed when calling [`SiglipModel`].
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 64):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import SiglipTextConfig, SiglipTextModel

    >>> # Initializing a SiglipTextConfig with google/siglip-base-patch16-224 style configuration
    >>> configuration = SiglipTextConfig()

    >>> # Initializing a SiglipTextModel (with random weights) from the google/siglip-base-patch16-224 style configuration
    >>> model = SiglipTextModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "siglip_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=64,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        # This differs from `CLIPTokenizer`'s default and from openai/siglip
        # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
        pad_token_id=1,
        bos_token_id=49406,
        eos_token_id=49407,
        **kwargs,
    ):
        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the text config dict if we are loading from SiglipConfig
        if config_dict.get("model_type") == "siglip":
            config_dict = config_dict["text_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class SiglipVisionConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
    Siglip vision encoder according to the specified arguments, defining the model architecture. Instantiating a
    configuration with the defaults will yield a similar configuration to that of the vision encoder of the Siglip
    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import SiglipVisionConfig, SiglipVisionModel

    >>> # Initializing a SiglipVisionConfig with google/siglip-base-patch16-224 style configuration
    >>> configuration = SiglipVisionConfig()

    >>> # Initializing a SiglipVisionModel (with random weights) from the google/siglip-base-patch16-224 style configuration
    >>> model = SiglipVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "siglip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="gelu_pytorch_tanh",
        layer_norm_eps=1e-6,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        **kwargs,
    ):
        super().__init__(**kwargs)

        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        cls._set_token_in_kwargs(kwargs)

        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # get the vision config dict if we are loading from SiglipConfig
        if config_dict.get("model_type") == "siglip":
            config_dict = config_dict["vision_config"]

        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)


class SiglipConfig(PretrainedConfig):
    r"""
    [`SiglipConfig`] is the configuration class to store the configuration of a [`SiglipModel`]. It is used to
    instantiate a Siglip model according to the specified arguments, defining the text model and vision model configs.
    Instantiating a configuration with the defaults will yield a similar configuration to that of the Siglip
    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`SiglipTextConfig`].
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`SiglipVisionConfig`].
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter. Default is used as per the original Siglip
            implementation.
        kwargs (*optional*):
            Dictionary of keyword arguments.

    Example:

    ```python
    >>> from transformers import SiglipConfig, SiglipModel

    >>> # Initializing a SiglipConfig with google/siglip-base-patch16-224 style configuration
    >>> configuration = SiglipConfig()

    >>> # Initializing a SiglipModel (with random weights) from the google/siglip-base-patch16-224 style configuration
    >>> model = SiglipModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config

    >>> # We can also initialize a SiglipConfig from a SiglipTextConfig and a SiglipVisionConfig
    >>> from transformers import SiglipTextConfig, SiglipVisionConfig

    >>> # Initializing a SiglipText and SiglipVision configuration
    >>> config_text = SiglipTextConfig()
    >>> config_vision = SiglipVisionConfig()

    >>> config = SiglipConfig.from_text_vision_configs(config_text, config_vision)
    ```"""

    model_type = "siglip"

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        # If `_config_dict` exist, we use them for the backward compatibility.
        # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot
        # of confusion!).
        text_config_dict = kwargs.pop("text_config_dict", None)
        vision_config_dict = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in
        # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most
        # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`.
        if text_config_dict is not None:
            if text_config is None:
                text_config = {}

            # This is the complete result when using `text_config_dict`.
            _text_config_dict = SiglipTextConfig(**text_config_dict).to_dict()

            # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different.
            for key, value in _text_config_dict.items():
                if key in text_config and value != text_config[key] and key not in ["transformers_version"]:
                    # If specified in `text_config_dict`
                    if key in text_config_dict:
                        message = (
                            f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
                            f'The value `text_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be super careful)
                    else:
                        message = (
                            f"`text_config_dict` is provided which will be used to initialize `SiglipTextConfig`. The "
                            f'value `text_config["{key}"]` will be overridden.'
                        )
                    logger.warning(message)

            # Update all values in `text_config` with the ones in `_text_config_dict`.
            text_config.update(_text_config_dict)

        if vision_config_dict is not None:
            if vision_config is None:
                vision_config = {}

            # This is the complete result when using `vision_config_dict`.
            _vision_config_dict = SiglipVisionConfig(**vision_config_dict).to_dict()
            # convert keys to string instead of integer
            if "id2label" in _vision_config_dict:
                _vision_config_dict["id2label"] = {
                    str(key): value for key, value in _vision_config_dict["id2label"].items()
                }

            # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different.
            for key, value in _vision_config_dict.items():
                if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]:
                    # If specified in `vision_config_dict`
                    if key in vision_config_dict:
                        message = (
                            f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
                            f'values. The value `vision_config_dict["{key}"]` will be used instead.'
                        )
                    # If inferred from default argument values (just to be super careful)
                    else:
                        message = (
                            f"`vision_config_dict` is provided which will be used to initialize `SiglipVisionConfig`. "
                            f'The value `vision_config["{key}"]` will be overridden.'
                        )
                    logger.warning(message)

            # Update all values in `vision_config` with the ones in `_vision_config_dict`.
            vision_config.update(_vision_config_dict)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `SiglipTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.")

        self.text_config = SiglipTextConfig(**text_config)
        self.vision_config = SiglipVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(cls, text_config: SiglipTextConfig, vision_config: SiglipVisionConfig, **kwargs):
        r"""
        Instantiate a [`SiglipConfig`] (or a derived class) from siglip text model configuration and siglip vision
        model configuration.

        Returns:
            [`SiglipConfig`]: An instance of a configuration object
        """

        return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)


class SiglipOnnxConfig(OnnxConfig):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),
                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
                ("attention_mask", {0: "batch", 1: "sequence"}),
            ]
        )

    @property
    def outputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("logits_per_image", {0: "batch"}),
                ("logits_per_text", {0: "batch"}),
                ("text_embeds", {0: "batch"}),
                ("image_embeds", {0: "batch"}),
            ]
        )

    @property
    def atol_for_validation(self) -> float:
        return 1e-4

    def generate_dummy_inputs(
        self,
        processor: "ProcessorMixin",
        batch_size: int = -1,
        seq_length: int = -1,
        framework: Optional["TensorType"] = None,
    ) -> Mapping[str, Any]:
        text_input_dict = super().generate_dummy_inputs(
            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
        )
        image_input_dict = super().generate_dummy_inputs(
            processor.image_processor, batch_size=batch_size, framework=framework
        )
        return {**text_input_dict, **image_input_dict}

    @property
    def default_onnx_opset(self) -> int:
        return 14
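For reference, the tiny-random values in `config.json` above map directly onto these classes. A sketch of rebuilding the same config by hand, assuming the file is imported locally from this repo (unspecified fields keep the defaults defined above):

```python
# Sketch: reconstruct the tiny-random SiglipConfig from its sub-configs.
from configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig

text = SiglipTextConfig(
    hidden_size=144, intermediate_size=538, num_attention_heads=2,
    num_hidden_layers=3, projection_dim=64, vocab_size=32000,
)
vision = SiglipVisionConfig(
    hidden_size=144, image_size=30, intermediate_size=538,
    num_attention_heads=2, num_hidden_layers=3, patch_size=2, projection_dim=64,
)
config = SiglipConfig.from_text_vision_configs(text, vision, projection_dim=512)

# With image_size=30 and patch_size=2 the vision tower processes (30 // 2) ** 2 = 225 patches.
print((config.vision_config.image_size // config.vision_config.patch_size) ** 2)  # 225
```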
image_processing_siglip.py ADDED
@@ -0,0 +1,229 @@
# coding=utf-8
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Image processor class for SigLIP."""

from typing import Dict, Optional, Union

import numpy as np

from transformers.image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
from transformers.image_transforms import (
    rescale,
    resize,
    to_channel_dimension_format,
)
from transformers.image_utils import (
    ChannelDimension,
    ImageInput,
    PILImageResampling,
    infer_channel_dimension_format,
    is_scaled_image,
    make_list_of_images,
    to_numpy_array,
    valid_images,
)
from transformers.utils import TensorType, is_vision_available, logging


logger = logging.get_logger(__name__)


if is_vision_available():
    import PIL


class SiglipImageProcessor(BaseImageProcessor):
    r"""
    Constructs a SigLIP image processor.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
            `do_resize` in the `preprocess` method.
        size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
            Size of the image after resizing. Can be overridden by `size` in the `preprocess` method.
        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
        do_rescale (`bool`, *optional*, defaults to `True`):
            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
            the `preprocess` method.
        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
            method.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        do_resize: bool = True,
        size: Dict[str, int] = None,
        resample: PILImageResampling = PILImageResampling.BILINEAR,
        do_rescale: bool = True,
        rescale_factor: Union[int, float] = 1 / 255,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        size = size if size is not None else {"height": 224, "width": 224}
        size = get_size_dict(size, default_to_square=False)

        self.do_resize = do_resize
        self.size = size
        self.resample = resample
        self.do_rescale = do_rescale
        self.rescale_factor = rescale_factor

    def rescale(
        self,
        image: np.ndarray,
        rescale_factor: float,
        data_format: Optional[Union[str, ChannelDimension]] = None,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> np.ndarray:
        """
        Rescale an image by a scale factor. image = image * scale, after which image = image * 2 - 1.

        Args:
            image (`np.ndarray`):
                Image to rescale.
            rescale_factor (`float`):
                The scaling factor to rescale pixel values by.
            data_format (`str` or `ChannelDimension`, *optional*):
                The channel dimension format for the output image. If unset, the channel dimension format of the input
                image is used. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.

        Returns:
            `np.ndarray`: The rescaled image.
        """
        # first, rescale to 0->1
        rescaled_image = rescale(
            image, scale=rescale_factor, data_format=data_format, input_data_format=input_data_format, **kwargs
        )

        # next, rescale to -1->1
        rescaled_image = 2 * rescaled_image - 1

        return rescaled_image

    def preprocess(
        self,
        images: ImageInput,
        do_resize: bool = None,
        size: Dict[str, int] = None,
        resample: PILImageResampling = None,
        do_rescale: bool = None,
        rescale_factor: float = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
        input_data_format: Optional[Union[str, ChannelDimension]] = None,
        **kwargs,
    ) -> PIL.Image.Image:
        """
        Preprocess an image or batch of images.

        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
                Whether to resize the image.
            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
                Size of the image after resizing.
            resample (`int`, *optional*, defaults to `self.resample`):
                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
                has an effect if `do_resize` is set to `True`.
            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
                Whether to rescale the image.
            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
            return_tensors (`str` or `TensorType`, *optional*):
                The type of tensors to return. Can be one of:
                - Unset: Return a list of `np.ndarray`.
                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
                The channel dimension format for the output image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - Unset: Use the channel dimension format of the input image.
            input_data_format (`ChannelDimension` or `str`, *optional*):
                The channel dimension format for the input image. If unset, the channel dimension format is inferred
                from the input image. Can be one of:
                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
        """
        do_resize = do_resize if do_resize is not None else self.do_resize
        size = size if size is not None else self.size
        size = get_size_dict(size, param_name="size", default_to_square=False)
        resample = resample if resample is not None else self.resample
        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        if do_resize and size is None:
            raise ValueError("Size must be specified if do_resize is True.")

        if do_rescale and rescale_factor is None:
            raise ValueError("Rescale factor must be specified if do_rescale is True.")

        # All transformations expect numpy arrays.
        images = [to_numpy_array(image) for image in images]

        if is_scaled_image(images[0]) and do_rescale:
            logger.warning_once(
                "It looks like you are trying to rescale already rescaled images. If the input"
                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
            )

        if input_data_format is None:
            # We assume that all images have the same channel dimension format.
            input_data_format = infer_channel_dimension_format(images[0])

        if do_resize:
            images = [
                resize(image=image, size=(size["width"], size["height"]), resample=resample, input_data_format=input_data_format)
                for image in images
            ]

        if do_rescale:
            images = [
                self.rescale(image=image, rescale_factor=rescale_factor, input_data_format=input_data_format)
                for image in images
            ]

        images = [
            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
        ]

        data = {"pixel_values": images}
        return BatchFeature(data=data, tensor_type=return_tensors)
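A quick sanity check of the two-step rescale above (divide by 255, then map to [-1, 1]) is to push a dummy image through the processor; a sketch assuming `image_processing_siglip.py` from this commit is importable locally:

```python
# Sketch: run a random uint8 image through SiglipImageProcessor.
import numpy as np

from image_processing_siglip import SiglipImageProcessor

processor = SiglipImageProcessor(size={"height": 30, "width": 30})    # matches the tiny config's image_size
image = np.random.randint(0, 256, size=(64, 48, 3), dtype=np.uint8)   # HWC uint8 input

batch = processor.preprocess(image, return_tensors="np")
pixel_values = batch["pixel_values"]

print(pixel_values.shape)                      # (1, 3, 30, 30): batched, channels-first, resized
print(pixel_values.min(), pixel_values.max())  # both within [-1.0, 1.0]
```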
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:77b1761351ca205f553d40a66f68e1b0f1b10d6594b542f3e8d35582a6cc011f
size 25418576
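The weights are stored as a Git LFS pointer, roughly 25 MB of float32 parameters (about 6M values at 4 bytes each). One way to inspect the downloaded file, assuming the `safetensors` package is installed:

```python
# Sketch: list tensors and count parameters in the downloaded model.safetensors.
from safetensors import safe_open

total = 0
with safe_open("model.safetensors", framework="pt") as f:
    for name in f.keys():
        total += f.get_tensor(name).numel()

print(f"{total:,} parameters")  # ~25 MB / 4 bytes per float32 value
```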
modeling_siglip.py ADDED
@@ -0,0 +1,1158 @@
1
+ # coding=utf-8
2
+ # Copyright 2023 Google AI and The HuggingFace Team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch Siglip model."""
16
+
17
+
18
+ from dataclasses import dataclass
19
+ from typing import Any, Optional, Tuple, Union
20
+
21
+ import torch
22
+ import torch.utils.checkpoint
23
+ from torch import nn
24
+
25
+ from transformers.activations import ACT2FN
26
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
27
+ from transformers.modeling_utils import PreTrainedModel
28
+ from transformers.utils import (
29
+ ModelOutput,
30
+ add_start_docstrings,
31
+ add_start_docstrings_to_model_forward,
32
+ logging,
33
+ replace_return_docstrings,
34
+ )
35
+ from .configuration_siglip import SiglipConfig, SiglipTextConfig, SiglipVisionConfig
36
+
37
+
38
+ logger = logging.get_logger(__name__)
39
+
40
+ _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
41
+
42
+ SIGLIP_PRETRAINED_MODEL_ARCHIVE_LIST = [
43
+ "google/siglip-base-patch16-224",
44
+ # See all SigLIP models at https://huggingface.co/models?filter=siglip
45
+ ]
46
+
47
+
48
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
49
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
50
+ """
51
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
52
+ """
53
+ bsz, src_len = mask.size()
54
+ tgt_len = tgt_len if tgt_len is not None else src_len
55
+
56
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
57
+
58
+ inverted_mask = 1.0 - expanded_mask
59
+
60
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
61
+
62
+
63
+ # contrastive loss function, adapted from
64
+ # https://sachinruk.github.io/blog/2021-03-07-siglip.html
65
+ def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
66
+ return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
67
+
68
+
69
+ # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->siglip
70
+ def siglip_loss(similarity: torch.Tensor) -> torch.Tensor:
71
+ caption_loss = contrastive_loss(similarity)
72
+ image_loss = contrastive_loss(similarity.t())
73
+ return (caption_loss + image_loss) / 2.0
74
+
75
+
76
+ @dataclass
77
+ # Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Siglip
78
+ class SiglipVisionModelOutput(ModelOutput):
79
+ """
80
+ Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
81
+
82
+ Args:
83
+ image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
84
+ The image embeddings obtained by applying the projection layer to the pooler_output.
85
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
86
+ Sequence of hidden-states at the output of the last layer of the model.
87
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
88
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
89
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
90
+
91
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
92
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
93
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
94
+ sequence_length)`.
95
+
96
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
97
+ heads.
98
+ """
99
+
100
+ image_embeds: Optional[torch.FloatTensor] = None
101
+ last_hidden_state: torch.FloatTensor = None
102
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
103
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
104
+
105
+
106
+ @dataclass
107
+ # Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Siglip
108
+ class SiglipTextModelOutput(ModelOutput):
109
+ """
110
+ Base class for text model's outputs that also contains a pooling of the last hidden states.
111
+
112
+ Args:
113
+ text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
114
+ The text embeddings obtained by applying the projection layer to the pooler_output.
115
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
116
+ Sequence of hidden-states at the output of the last layer of the model.
117
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
118
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
119
+ one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
120
+
121
+ Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
122
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
123
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
124
+ sequence_length)`.
125
+
126
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
127
+ heads.
128
+ """
129
+
130
+ text_embeds: Optional[torch.FloatTensor] = None
131
+ last_hidden_state: torch.FloatTensor = None
132
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
133
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
134
+
135
+
136
+ @dataclass
137
+ # Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->Siglip
138
+ class SiglipOutput(ModelOutput):
139
+ """
140
+ Args:
141
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
142
+ Contrastive loss for image-text similarity.
143
+ logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
144
+ The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
145
+ similarity scores.
146
+ logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
147
+ The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
148
+ similarity scores.
149
+ text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
150
+ The text embeddings obtained by applying the projection layer to the pooled output of [`SiglipTextModel`].
151
+ image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
152
+ The image embeddings obtained by applying the projection layer to the pooled output of
153
+ [`SiglipVisionModel`].
154
+ text_model_output(`BaseModelOutputWithPooling`):
155
+ The output of the [`SiglipTextModel`].
156
+ vision_model_output(`BaseModelOutputWithPooling`):
157
+ The output of the [`SiglipVisionModel`].
158
+ """
159
+
160
+ loss: Optional[torch.FloatTensor] = None
161
+ logits_per_image: torch.FloatTensor = None
162
+ logits_per_text: torch.FloatTensor = None
163
+ text_embeds: torch.FloatTensor = None
164
+ image_embeds: torch.FloatTensor = None
165
+ text_model_output: BaseModelOutputWithPooling = None
166
+ vision_model_output: BaseModelOutputWithPooling = None
167
+
168
+ def to_tuple(self) -> Tuple[Any]:
169
+ return tuple(
170
+ self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
171
+ for k in self.keys()
172
+ )
173
+
174
+
175
+ class SiglipVisionEmbeddings(nn.Module):
176
+ def __init__(self, config: SiglipVisionConfig):
177
+ super().__init__()
178
+ self.config = config
179
+ self.embed_dim = config.hidden_size
180
+ self.image_size = config.image_size
181
+ self.patch_size = config.patch_size
182
+
183
+ self.patch_embedding = nn.Conv2d(
184
+ in_channels=config.num_channels,
185
+ out_channels=self.embed_dim,
186
+ kernel_size=self.patch_size,
187
+ stride=self.patch_size,
188
+ padding="valid",
189
+ )
190
+
191
+ self.num_patches = (self.image_size // self.patch_size) ** 2
192
+ self.num_positions = self.num_patches
193
+ self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
194
+ self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
195
+
196
+ def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
197
+
198
+ patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid]
199
+ embeddings = patch_embeds.flatten(2).transpose(1, 2)
200
+
201
+ embeddings = embeddings + self.position_embedding(self.position_ids)
202
+ return embeddings
203
+
204
+
205
+ # Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Siglip
206
+ class SiglipTextEmbeddings(nn.Module):
207
+ def __init__(self, config: SiglipTextConfig):
208
+ super().__init__()
209
+ embed_dim = config.hidden_size
210
+
211
+ self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
212
+ self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
213
+
214
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
215
+ self.register_buffer(
216
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
217
+ )
218
+
219
+ def forward(
220
+ self,
221
+ input_ids: Optional[torch.LongTensor] = None,
222
+ position_ids: Optional[torch.LongTensor] = None,
223
+ inputs_embeds: Optional[torch.FloatTensor] = None,
224
+ ) -> torch.Tensor:
225
+ seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
226
+
227
+ if position_ids is None:
228
+ position_ids = self.position_ids[:, :seq_length]
229
+
230
+ if inputs_embeds is None:
231
+ inputs_embeds = self.token_embedding(input_ids)
232
+
233
+ position_embeddings = self.position_embedding(position_ids)
234
+ embeddings = inputs_embeds + position_embeddings
235
+
236
+ return embeddings
237
+
238
+
239
+ # Copied from transformers.models.clip.modeling_clip.CLIPAttention with CLIP->Siglip
240
+ class SiglipAttention(nn.Module):
241
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
242
+
243
+ def __init__(self, config):
244
+ super().__init__()
245
+ self.config = config
246
+ self.embed_dim = config.hidden_size
247
+ self.num_heads = config.num_attention_heads
248
+ self.head_dim = self.embed_dim // self.num_heads
249
+ if self.head_dim * self.num_heads != self.embed_dim:
250
+ raise ValueError(
251
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
252
+ f" {self.num_heads})."
253
+ )
254
+ self.scale = self.head_dim**-0.5
255
+ self.dropout = config.attention_dropout
256
+
257
+ self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
258
+ self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
259
+ self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
260
+ self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
261
+
262
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
263
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
264
+
265
+ def forward(
266
+ self,
267
+ hidden_states: torch.Tensor,
268
+ attention_mask: Optional[torch.Tensor] = None,
269
+ causal_attention_mask: Optional[torch.Tensor] = None,
270
+ output_attentions: Optional[bool] = False,
271
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
272
+ """Input shape: Batch x Time x Channel"""
273
+
274
+ bsz, tgt_len, embed_dim = hidden_states.size()
275
+
276
+ # get query proj
277
+ query_states = self.q_proj(hidden_states) * self.scale
278
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
279
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
280
+
281
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
282
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
283
+ key_states = key_states.view(*proj_shape)
284
+ value_states = value_states.view(*proj_shape)
285
+
286
+ src_len = key_states.size(1)
287
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
288
+
289
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
290
+ raise ValueError(
291
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
292
+ f" {attn_weights.size()}"
293
+ )
294
+
295
+ # apply the causal_attention_mask first
296
+ if causal_attention_mask is not None:
297
+ if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
298
+ raise ValueError(
299
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
300
+ f" {causal_attention_mask.size()}"
301
+ )
302
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
303
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
304
+
305
+ if attention_mask is not None:
306
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
307
+ raise ValueError(
308
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
309
+ )
310
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
311
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
312
+
313
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
314
+
315
+ if output_attentions:
316
+ # this operation is a bit akward, but it's required to
317
+ # make sure that attn_weights keeps its gradient.
318
+ # In order to do so, attn_weights have to reshaped
319
+ # twice and have to be reused in the following
320
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
321
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
322
+ else:
323
+ attn_weights_reshaped = None
324
+
325
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
326
+
327
+ attn_output = torch.bmm(attn_probs, value_states)
328
+
329
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
330
+ raise ValueError(
331
+ f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
332
+ f" {attn_output.size()}"
333
+ )
334
+
335
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
336
+ attn_output = attn_output.transpose(1, 2)
337
+ attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)
338
+
339
+ attn_output = self.out_proj(attn_output)
340
+
341
+ return attn_output, attn_weights_reshaped
342
+
343
+
344
+ # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Siglip
345
+ class SiglipMLP(nn.Module):
346
+ def __init__(self, config):
347
+ super().__init__()
348
+ self.config = config
349
+ self.activation_fn = ACT2FN[config.hidden_act]
350
+ self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
351
+ self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
352
+
353
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
354
+ hidden_states = self.fc1(hidden_states)
355
+ hidden_states = self.activation_fn(hidden_states)
356
+ hidden_states = self.fc2(hidden_states)
357
+ return hidden_states
358
+
359
+
360
+ # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Siglip
361
+ class SiglipEncoderLayer(nn.Module):
362
+ def __init__(self, config: SiglipConfig):
363
+ super().__init__()
364
+ self.embed_dim = config.hidden_size
365
+ self.self_attn = SiglipAttention(config)
366
+ self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
367
+ self.mlp = SiglipMLP(config)
368
+ self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
369
+
370
+ def forward(
371
+ self,
372
+ hidden_states: torch.Tensor,
373
+ attention_mask: torch.Tensor,
374
+ causal_attention_mask: torch.Tensor,
375
+ output_attentions: Optional[bool] = False,
376
+ ) -> Tuple[torch.FloatTensor]:
377
+ """
378
+ Args:
379
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
380
+ attention_mask (`torch.FloatTensor`): attention mask of size
381
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
382
+ `(config.encoder_attention_heads,)`.
383
+ output_attentions (`bool`, *optional*):
384
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
385
+ returned tensors for more detail.
386
+ """
387
+ residual = hidden_states
388
+
389
+ hidden_states = self.layer_norm1(hidden_states)
390
+ hidden_states, attn_weights = self.self_attn(
391
+ hidden_states=hidden_states,
392
+ attention_mask=attention_mask,
393
+ causal_attention_mask=causal_attention_mask,
394
+ output_attentions=output_attentions,
395
+ )
396
+ hidden_states = residual + hidden_states
397
+
398
+ residual = hidden_states
399
+ hidden_states = self.layer_norm2(hidden_states)
400
+ hidden_states = self.mlp(hidden_states)
401
+ hidden_states = residual + hidden_states
402
+
403
+ outputs = (hidden_states,)
404
+
405
+ if output_attentions:
406
+ outputs += (attn_weights,)
407
+
408
+ return outputs
409
+
410
+
411
+ class SiglipPreTrainedModel(PreTrainedModel):
412
+ """
413
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
414
+ models.
415
+ """
416
+
417
+ config_class = SiglipConfig
418
+ base_model_prefix = "siglip"
419
+ supports_gradient_checkpointing = True
420
+
421
+ def _init_weights(self, module):
422
+ """Initialize the weights"""
423
+ factor = self.config.initializer_factor
424
+ if isinstance(module, SiglipTextEmbeddings):
425
+ module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
426
+ module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
427
+ elif isinstance(module, SiglipVisionEmbeddings):
428
+ factor = self.config.initializer_factor
429
+ nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor)
430
+ nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor)
431
+ elif isinstance(module, SiglipAttention):
432
+ factor = self.config.initializer_factor
433
+ in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
434
+ out_proj_std = (module.embed_dim**-0.5) * factor
435
+ nn.init.normal_(module.q_proj.weight, std=in_proj_std)
436
+ nn.init.normal_(module.k_proj.weight, std=in_proj_std)
437
+ nn.init.normal_(module.v_proj.weight, std=in_proj_std)
438
+ nn.init.normal_(module.out_proj.weight, std=out_proj_std)
439
+ elif isinstance(module, SiglipMLP):
440
+ factor = self.config.initializer_factor
441
+ in_proj_std = (
442
+ (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
443
+ )
444
+ fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
445
+ nn.init.normal_(module.fc1.weight, std=fc_std)
446
+ nn.init.normal_(module.fc2.weight, std=in_proj_std)
447
+ if isinstance(module, nn.LayerNorm):
448
+ module.bias.data.zero_()
449
+ module.weight.data.fill_(1.0)
450
+ if isinstance(module, nn.Linear) and module.bias is not None:
451
+ module.bias.data.zero_()
452
+
453
+ def _set_gradient_checkpointing(self, module, value=False):
454
+ if isinstance(module, SiglipEncoder):
455
+ module.gradient_checkpointing = value
456
+
457
+
458
+ SIGLIP_START_DOCSTRING = r"""
459
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
460
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
461
+ etc.)
462
+
463
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
464
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
465
+ and behavior.
466
+
467
+ Parameters:
468
+ config ([`SiglipConfig`]): Model configuration class with all the parameters of the model.
469
+ Initializing with a config file does not load the weights associated with the model, only the
470
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
471
+ """
472
+
473
+ SIGLIP_TEXT_INPUTS_DOCSTRING = r"""
474
+ Args:
475
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
476
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
477
+ it.
478
+
479
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
480
+ [`PreTrainedTokenizer.__call__`] for details.
481
+
482
+ [What are input IDs?](../glossary#input-ids)
483
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
484
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
485
+
486
+ - 1 for tokens that are **not masked**,
487
+ - 0 for tokens that are **masked**.
488
+
489
+ [What are attention masks?](../glossary#attention-mask)
490
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
491
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
492
+ config.max_position_embeddings - 1]`.
493
+
494
+ [What are position IDs?](../glossary#position-ids)
495
+ output_attentions (`bool`, *optional*):
496
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
497
+ tensors for more detail.
498
+ output_hidden_states (`bool`, *optional*):
499
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
500
+ more detail.
501
+ return_dict (`bool`, *optional*):
502
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
503
+ """
504
+
505
+ SIGLIP_VISION_INPUTS_DOCSTRING = r"""
506
+ Args:
507
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
508
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
509
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
510
+ output_attentions (`bool`, *optional*):
511
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
512
+ tensors for more detail.
513
+ output_hidden_states (`bool`, *optional*):
514
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
515
+ more detail.
516
+ return_dict (`bool`, *optional*):
517
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
518
+ """
519
+
520
+ SIGLIP_INPUTS_DOCSTRING = r"""
521
+ Args:
522
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
523
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
524
+ it.
525
+
526
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
527
+ [`PreTrainedTokenizer.__call__`] for details.
528
+
529
+ [What are input IDs?](../glossary#input-ids)
530
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
531
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
532
+
533
+ - 1 for tokens that are **not masked**,
534
+ - 0 for tokens that are **masked**.
535
+
536
+ [What are attention masks?](../glossary#attention-mask)
537
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
538
+ Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
539
+ config.max_position_embeddings - 1]`.
540
+
541
+ [What are position IDs?](../glossary#position-ids)
542
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
543
+ Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
544
+ [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details.
545
+ return_loss (`bool`, *optional*):
546
+ Whether or not to return the contrastive loss.
547
+ output_attentions (`bool`, *optional*):
548
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
549
+ tensors for more detail.
550
+ output_hidden_states (`bool`, *optional*):
551
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
552
+ more detail.
553
+ return_dict (`bool`, *optional*):
554
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
555
+ """
556
+
557
+
558
+ # Copied from transformers.models.clip.modeling_clip.CLIPEncoder with CLIP->Siglip
559
+ class SiglipEncoder(nn.Module):
560
+ """
561
+ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
562
+ [`SiglipEncoderLayer`].
563
+
564
+ Args:
565
+ config: SiglipConfig
566
+ """
567
+
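+ # Although the annotation says `SiglipConfig`, this encoder is instantiated below with either a
+ # `SiglipTextConfig` or a `SiglipVisionConfig`; only the fields shared by both sub-configs are read here.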
568
+ def __init__(self, config: SiglipConfig):
569
+ super().__init__()
570
+ self.config = config
571
+ self.layers = nn.ModuleList([SiglipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
572
+ self.gradient_checkpointing = False
573
+
574
+ def forward(
575
+ self,
576
+ inputs_embeds,
577
+ attention_mask: Optional[torch.Tensor] = None,
578
+ causal_attention_mask: Optional[torch.Tensor] = None,
579
+ output_attentions: Optional[bool] = None,
580
+ output_hidden_states: Optional[bool] = None,
581
+ return_dict: Optional[bool] = None,
582
+ ) -> Union[Tuple, BaseModelOutput]:
583
+ r"""
584
+ Args:
585
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
586
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
587
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
588
+ than the model's internal embedding lookup matrix.
589
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
590
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
591
+
592
+ - 1 for tokens that are **not masked**,
593
+ - 0 for tokens that are **masked**.
594
+
595
+ [What are attention masks?](../glossary#attention-mask)
596
+ causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
597
+ Causal mask for the text model. Mask values selected in `[0, 1]`:
598
+
599
+ - 1 for tokens that are **not masked**,
600
+ - 0 for tokens that are **masked**.
601
+
602
+ [What are attention masks?](../glossary#attention-mask)
603
+ output_attentions (`bool`, *optional*):
604
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
605
+ returned tensors for more detail.
606
+ output_hidden_states (`bool`, *optional*):
607
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
608
+ for more detail.
609
+ return_dict (`bool`, *optional*):
610
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
611
+ """
612
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
613
+ output_hidden_states = (
614
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
615
+ )
616
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
617
+
618
+ encoder_states = () if output_hidden_states else None
619
+ all_attentions = () if output_attentions else None
620
+
621
+ hidden_states = inputs_embeds
622
+ for idx, encoder_layer in enumerate(self.layers):
623
+ if output_hidden_states:
624
+ encoder_states = encoder_states + (hidden_states,)
625
+ if self.gradient_checkpointing and self.training:
626
+
627
+ def create_custom_forward(module):
628
+ def custom_forward(*inputs):
629
+ return module(*inputs, output_attentions)
630
+
631
+ return custom_forward
632
+
633
+ layer_outputs = torch.utils.checkpoint.checkpoint(
634
+ create_custom_forward(encoder_layer),
635
+ hidden_states,
636
+ attention_mask,
637
+ causal_attention_mask,
638
+ )
639
+ else:
640
+ layer_outputs = encoder_layer(
641
+ hidden_states,
642
+ attention_mask,
643
+ causal_attention_mask,
644
+ output_attentions=output_attentions,
645
+ )
646
+
647
+ hidden_states = layer_outputs[0]
648
+
649
+ if output_attentions:
650
+ all_attentions = all_attentions + (layer_outputs[1],)
651
+
652
+ if output_hidden_states:
653
+ encoder_states = encoder_states + (hidden_states,)
654
+
655
+ if not return_dict:
656
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
657
+ return BaseModelOutput(
658
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
659
+ )
660
+
661
+
662
+ class SiglipTextTransformer(nn.Module):
663
+ def __init__(self, config: SiglipTextConfig):
664
+ super().__init__()
665
+ self.config = config
666
+ embed_dim = config.hidden_size
667
+ self.embeddings = SiglipTextEmbeddings(config)
668
+ self.encoder = SiglipEncoder(config)
669
+ self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
670
+
671
+ self.head = nn.Linear(embed_dim, embed_dim)
672
+
673
+ @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
674
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
675
+ def forward(
676
+ self,
677
+ input_ids: Optional[torch.Tensor] = None,
678
+ attention_mask: Optional[torch.Tensor] = None,
679
+ position_ids: Optional[torch.Tensor] = None,
680
+ output_attentions: Optional[bool] = None,
681
+ output_hidden_states: Optional[bool] = None,
682
+ return_dict: Optional[bool] = None,
683
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
684
+ r"""
685
+ Returns:
686
+
687
+ """
688
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
689
+ output_hidden_states = (
690
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
691
+ )
692
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
693
+
694
+ if input_ids is None:
695
+ raise ValueError("You have to specify input_ids")
696
+
697
+ input_shape = input_ids.size()
698
+ input_ids = input_ids.view(-1, input_shape[-1])
699
+
700
+ hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)
701
+
702
+ # note: SigLIP's text model does not use a causal mask, unlike the original CLIP model.
703
+ # expand attention_mask
704
+ if attention_mask is not None:
705
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
706
+ attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
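+ # Note that the expanded mask is not forwarded to the encoder call below (attention_mask=None), so
+ # padding positions are still attended to. This appears to follow the reference SigLIP setup, where
+ # text is padded to a fixed length and no padding mask is applied.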
707
+
708
+ encoder_outputs = self.encoder(
709
+ inputs_embeds=hidden_states,
710
+ attention_mask=None,
711
+ causal_attention_mask=None,
712
+ output_attentions=output_attentions,
713
+ output_hidden_states=output_hidden_states,
714
+ return_dict=return_dict,
715
+ )
716
+
717
+ last_hidden_state = encoder_outputs[0]
718
+ last_hidden_state = self.final_layer_norm(last_hidden_state)
719
+
720
+ # Assuming "sticky" EOS tokenization, last token is always EOS.
721
+ pooled_output = last_hidden_state[:, -1, :]
722
+ pooled_output = self.head(pooled_output)
723
+
724
+ if not return_dict:
725
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
726
+
727
+ return BaseModelOutputWithPooling(
728
+ last_hidden_state=last_hidden_state,
729
+ pooler_output=pooled_output,
730
+ hidden_states=encoder_outputs.hidden_states,
731
+ attentions=encoder_outputs.attentions,
732
+ )
733
+
734
+
735
+ @add_start_docstrings(
736
+ """The text model from SigLIP without any head or projection on top.""",
737
+ SIGLIP_START_DOCSTRING,
738
+ )
739
+ class SiglipTextModel(SiglipPreTrainedModel):
740
+ config_class = SiglipTextConfig
741
+
742
+ _no_split_modules = ["SiglipTextEmbeddings", "SiglipEncoderLayer"]
743
+
744
+ def __init__(self, config: SiglipTextConfig):
745
+ super().__init__(config)
746
+ self.text_model = SiglipTextTransformer(config)
747
+ # Initialize weights and apply final processing
748
+ self.post_init()
749
+
750
+ def get_input_embeddings(self) -> nn.Module:
751
+ return self.text_model.embeddings.token_embedding
752
+
753
+ def set_input_embeddings(self, value):
754
+ self.text_model.embeddings.token_embedding = value
755
+
756
+ @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
757
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipTextConfig)
758
+ def forward(
759
+ self,
760
+ input_ids: Optional[torch.Tensor] = None,
761
+ attention_mask: Optional[torch.Tensor] = None,
762
+ position_ids: Optional[torch.Tensor] = None,
763
+ output_attentions: Optional[bool] = None,
764
+ output_hidden_states: Optional[bool] = None,
765
+ return_dict: Optional[bool] = None,
766
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
767
+ r"""
768
+ Returns:
769
+
770
+ Examples:
771
+
772
+ ```python
773
+ >>> from transformers import AutoTokenizer, SiglipTextModel
774
+
775
+ >>> model = SiglipTextModel.from_pretrained("google/siglip-base-patch16-224")
776
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
777
+
778
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
779
+
780
+ >>> outputs = model(**inputs)
781
+ >>> last_hidden_state = outputs.last_hidden_state
782
+ >>> pooled_output = outputs.pooler_output # pooled (EOS token) states
783
+ ```"""
784
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
785
+
786
+ return self.text_model(
787
+ input_ids=input_ids,
788
+ attention_mask=attention_mask,
789
+ position_ids=position_ids,
790
+ output_attentions=output_attentions,
791
+ output_hidden_states=output_hidden_states,
792
+ return_dict=return_dict,
793
+ )
794
+
795
+
796
+ class SiglipVisionTransformer(nn.Module):
797
+ def __init__(self, config: SiglipVisionConfig):
798
+ super().__init__()
799
+ self.config = config
800
+ embed_dim = config.hidden_size
801
+
802
+ self.embeddings = SiglipVisionEmbeddings(config)
803
+ self.encoder = SiglipEncoder(config)
804
+ self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
805
+ self.head = SiglipMultiheadAttentionPoolingHead(config)
806
+
807
+ @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
808
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
809
+ def forward(
810
+ self,
811
+ pixel_values,
812
+ output_attentions: Optional[bool] = None,
813
+ output_hidden_states: Optional[bool] = None,
814
+ return_dict: Optional[bool] = None,
815
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
816
+ r"""
817
+ Returns:
818
+
819
+ """
820
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
821
+ output_hidden_states = (
822
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
823
+ )
824
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
825
+
826
+ hidden_states = self.embeddings(pixel_values)
827
+
828
+ encoder_outputs = self.encoder(
829
+ inputs_embeds=hidden_states,
830
+ output_attentions=output_attentions,
831
+ output_hidden_states=output_hidden_states,
832
+ return_dict=return_dict,
833
+ )
834
+
835
+ last_hidden_state = encoder_outputs[0]
836
+ last_hidden_state = self.post_layernorm(last_hidden_state)
837
+
838
+
839
+ pooled_output = self.head(last_hidden_state)
840
+
841
+ if not return_dict:
842
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
843
+
844
+ return BaseModelOutputWithPooling(
845
+ last_hidden_state=last_hidden_state,
846
+ pooler_output=pooled_output,
847
+ hidden_states=encoder_outputs.hidden_states,
848
+ attentions=encoder_outputs.attentions,
849
+ )
850
+
851
+
852
+ class SiglipMultiheadAttentionPoolingHead(nn.Module):
853
+ """Multihead Attention Pooling."""
854
+
855
+ def __init__(self, config: SiglipVisionConfig):
856
+ super().__init__()
857
+
858
+ self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
859
+ self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
860
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
861
+ self.mlp = SiglipMLP(config)
862
+
863
+ def forward(self, hidden_state):
864
+ batch_size = hidden_state.shape[0]
865
+ probe = self.probe.repeat(batch_size, 1, 1)
866
+
867
+ hidden_state = self.attention(probe, hidden_state, hidden_state)[0]
868
+
869
+ residual = hidden_state
870
+ hidden_state = self.layernorm(hidden_state)
871
+ hidden_state = residual + self.mlp(hidden_state)
872
+
873
+ return hidden_state[:, 0]
874
+
875
+
876
+ @add_start_docstrings(
877
+ """The vision model from SigLIP without any head or projection on top.""",
878
+ SIGLIP_START_DOCSTRING,
879
+ )
880
+ class SiglipVisionModel(SiglipPreTrainedModel):
881
+ config_class = SiglipVisionConfig
882
+ main_input_name = "pixel_values"
883
+
884
+ def __init__(self, config: SiglipVisionConfig):
885
+ super().__init__(config)
886
+
887
+ self.vision_model = SiglipVisionTransformer(config)
888
+
889
+ # Initialize weights and apply final processing
890
+ self.post_init()
891
+
892
+ def get_input_embeddings(self) -> nn.Module:
893
+ return self.vision_model.embeddings.patch_embedding
894
+
895
+ @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
896
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=SiglipVisionConfig)
897
+ def forward(
898
+ self,
899
+ pixel_values,
900
+ output_attentions: Optional[bool] = None,
901
+ output_hidden_states: Optional[bool] = None,
902
+ return_dict: Optional[bool] = None,
903
+ ) -> Union[Tuple, BaseModelOutputWithPooling]:
904
+ r"""
905
+ Returns:
906
+
907
+ Examples:
908
+
909
+ ```python
910
+ >>> from PIL import Image
911
+ >>> import requests
912
+ >>> from transformers import AutoProcessor, SiglipVisionModel
913
+
914
+ >>> model = SiglipVisionModel.from_pretrained("google/siglip-base-patch16-224")
915
+ >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
916
+
917
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
918
+ >>> image = Image.open(requests.get(url, stream=True).raw)
919
+
920
+ >>> inputs = processor(images=image, return_tensors="pt")
921
+
922
+ >>> outputs = model(**inputs)
923
+ >>> last_hidden_state = outputs.last_hidden_state
924
+ >>> pooled_output = outputs.pooler_output # pooled features (attention-pooled, no CLS token)
925
+ ```"""
926
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
927
+
928
+ return self.vision_model(
929
+ pixel_values=pixel_values,
930
+ output_attentions=output_attentions,
931
+ output_hidden_states=output_hidden_states,
932
+ return_dict=return_dict,
933
+ )
934
+
935
+
936
+ @add_start_docstrings(SIGLIP_START_DOCSTRING)
937
+ class SiglipModel(SiglipPreTrainedModel):
938
+ config_class = SiglipConfig
939
+
940
+ def __init__(self, config: SiglipConfig):
941
+ super().__init__(config)
942
+
943
+ if not isinstance(config.text_config, SiglipTextConfig):
944
+ raise ValueError(
945
+ "config.text_config is expected to be of type SiglipTextConfig but is of type"
946
+ f" {type(config.text_config)}."
947
+ )
948
+
949
+ if not isinstance(config.vision_config, SiglipVisionConfig):
950
+ raise ValueError(
951
+ "config.vision_config is expected to be of type SiglipVisionConfig but is of type"
952
+ f" {type(config.vision_config)}."
953
+ )
954
+
955
+ text_config = config.text_config
956
+ vision_config = config.vision_config
957
+
958
+ self.text_model = SiglipTextModel(text_config)
959
+ self.vision_model = SiglipVisionModel(vision_config)
960
+
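+ # `temperature` and `bias` are SigLIP's learnable logit scale and logit bias. They are randomly
+ # initialized here (this is a tiny random checkpoint); the SigLIP paper reportedly initializes them
+ # to log(10) and -10 respectively.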
961
+ self.temperature = nn.Parameter(
962
+ torch.randn(
963
+ 1,
964
+ )
965
+ )
966
+ self.bias = nn.Parameter(
967
+ torch.randn(
968
+ 1,
969
+ )
970
+ )
971
+
972
+ # Initialize weights and apply final processing
973
+ self.post_init()
974
+
975
+ @add_start_docstrings_to_model_forward(SIGLIP_TEXT_INPUTS_DOCSTRING)
976
+ def get_text_features(
977
+ self,
978
+ input_ids: Optional[torch.Tensor] = None,
979
+ attention_mask: Optional[torch.Tensor] = None,
980
+ position_ids: Optional[torch.Tensor] = None,
981
+ output_attentions: Optional[bool] = None,
982
+ output_hidden_states: Optional[bool] = None,
983
+ return_dict: Optional[bool] = None,
984
+ ) -> torch.FloatTensor:
985
+ r"""
986
+ Returns:
987
+ text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained from
988
+ the pooled output of [`SiglipTextModel`].
989
+
990
+ Examples:
991
+
992
+ ```python
993
+ >>> from transformers import AutoTokenizer, SiglipModel
994
+
995
+ >>> model = SiglipModel.from_pretrained("google/siglip-base-patch16-224")
996
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip-base-patch16-224")
997
+
998
+ >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
999
+ >>> text_features = model.get_text_features(**inputs)
1000
+ ```"""
1001
+ # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
1002
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1003
+ output_hidden_states = (
1004
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1005
+ )
1006
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1007
+
1008
+ text_outputs = self.text_model(
1009
+ input_ids=input_ids,
1010
+ attention_mask=attention_mask,
1011
+ position_ids=position_ids,
1012
+ output_attentions=output_attentions,
1013
+ output_hidden_states=output_hidden_states,
1014
+ return_dict=return_dict,
1015
+ )
1016
+
1017
+ pooled_output = text_outputs[1]
1018
+
1019
+ return pooled_output
1020
+
1021
+ @add_start_docstrings_to_model_forward(SIGLIP_VISION_INPUTS_DOCSTRING)
1022
+ def get_image_features(
1023
+ self,
1024
+ pixel_values: Optional[torch.FloatTensor] = None,
1025
+ output_attentions: Optional[bool] = None,
1026
+ output_hidden_states: Optional[bool] = None,
1027
+ return_dict: Optional[bool] = None,
1028
+ ) -> torch.FloatTensor:
1029
+ r"""
1030
+ Returns:
1031
+ image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained from
1032
+ the pooled output of [`SiglipVisionModel`].
1033
+
1034
+ Examples:
1035
+
1036
+ ```python
1037
+ >>> from PIL import Image
1038
+ >>> import requests
1039
+ >>> from transformers import AutoProcessor, SiglipModel
1040
+
1041
+ >>> model = SiglipModel.from_pretrained("google/siglip-base-patch16-224")
1042
+ >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1043
+
1044
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1045
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1046
+
1047
+ >>> inputs = processor(images=image, return_tensors="pt")
1048
+
1049
+ >>> image_features = model.get_image_features(**inputs)
1050
+ ```"""
1051
+ # Use SiglipModel's config for some fields (if specified) instead of those of vision & text components.
1052
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1053
+ output_hidden_states = (
1054
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1055
+ )
1056
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1057
+
1058
+ vision_outputs = self.vision_model(
1059
+ pixel_values=pixel_values,
1060
+ output_attentions=output_attentions,
1061
+ output_hidden_states=output_hidden_states,
1062
+ return_dict=return_dict,
1063
+ )
1064
+
1065
+ pooled_output = vision_outputs[1]
1066
+
1067
+ return pooled_output
1068
+
1069
+ @add_start_docstrings_to_model_forward(SIGLIP_INPUTS_DOCSTRING)
1070
+ @replace_return_docstrings(output_type=SiglipOutput, config_class=SiglipConfig)
1071
+ def forward(
1072
+ self,
1073
+ input_ids: Optional[torch.LongTensor] = None,
1074
+ pixel_values: Optional[torch.FloatTensor] = None,
1075
+ attention_mask: Optional[torch.Tensor] = None,
1076
+ position_ids: Optional[torch.LongTensor] = None,
1077
+ return_loss: Optional[bool] = None,
1078
+ output_attentions: Optional[bool] = None,
1079
+ output_hidden_states: Optional[bool] = None,
1080
+ return_dict: Optional[bool] = None,
1081
+ ) -> Union[Tuple, SiglipOutput]:
1082
+ r"""
1083
+ Returns:
1084
+
1085
+ Examples:
1086
+
1087
+ ```python
1088
+ >>> from PIL import Image
1089
+ >>> import requests
1090
+ >>> from transformers import AutoProcessor, SiglipModel
1091
+
1092
+ >>> model = SiglipModel.from_pretrained("google/siglip-base-patch16-224")
1093
+ >>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
1094
+
1095
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
1096
+ >>> image = Image.open(requests.get(url, stream=True).raw)
1097
+
1098
+ >>> inputs = processor(
1099
+ ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
1100
+ ... )
1101
+
1102
+ >>> outputs = model(**inputs)
1103
+ >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
1104
+ >>> probs = logits_per_image.sigmoid() # SigLIP scores each image-text pair with a sigmoid, so these are per-pair probabilities
1105
+ ```"""
1106
+ # Use SigLIP model's config for some fields (if specified) instead of those of vision & text components.
1107
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1108
+ output_hidden_states = (
1109
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1110
+ )
1111
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1112
+
1113
+ vision_outputs = self.vision_model(
1114
+ pixel_values=pixel_values,
1115
+ output_attentions=output_attentions,
1116
+ output_hidden_states=output_hidden_states,
1117
+ return_dict=return_dict,
1118
+ )
1119
+
1120
+ text_outputs = self.text_model(
1121
+ input_ids=input_ids,
1122
+ attention_mask=attention_mask,
1123
+ position_ids=position_ids,
1124
+ output_attentions=output_attentions,
1125
+ output_hidden_states=output_hidden_states,
1126
+ return_dict=return_dict,
1127
+ )
1128
+
1129
+ image_embeds = vision_outputs[1]
1130
+ text_embeds = text_outputs[1]
1131
+
1132
+ # normalized features
1133
+ image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
1134
+ text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
1135
+
1136
+ # cosine similarity as logits
1137
+ logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * self.temperature.exp() + self.bias
1138
+ logits_per_image = logits_per_text.t()
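+ # unlike CLIP, SigLIP adds a learnable bias to the scaled similarities; each image-text pair is scored
+ # independently with a sigmoid rather than normalized across the batch with a softmax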
1141
+
1142
+ loss = None
1143
+ if return_loss:
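+ # (a sketch of the pairwise sigmoid loss, `siglip_loss`, is given after this class)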
1144
+ raise NotImplementedError("SigLIP loss to be implemented")
1145
+
1146
+ if not return_dict:
1147
+ output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
1148
+ return ((loss,) + output) if loss is not None else output
1149
+
1150
+ return SiglipOutput(
1151
+ loss=loss,
1152
+ logits_per_image=logits_per_image,
1153
+ logits_per_text=logits_per_text,
1154
+ text_embeds=text_embeds,
1155
+ image_embeds=image_embeds,
1156
+ text_model_output=text_outputs,
1157
+ vision_model_output=vision_outputs,
1158
+ )
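+
+
+ # A minimal sketch of the pairwise sigmoid loss from the SigLIP paper, provided for reference only: it is
+ # not called by `SiglipModel.forward` above, which raises NotImplementedError when `return_loss=True`.
+ # It assumes `logits_per_text` already includes the learnable temperature and bias computed in `forward`.
+ def siglip_loss(logits_per_text: torch.Tensor) -> torch.Tensor:
+     # labels are +1 on the diagonal (matching image-text pairs) and -1 everywhere else
+     eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
+     labels = 2.0 * eye - torch.ones_like(logits_per_text)
+     # log-sigmoid of the signed logits, summed over all candidates for each text, averaged over the batch
+     nll = -torch.sum(torch.nn.functional.logsigmoid(labels * logits_per_text), dim=-1)
+     return nll.mean()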