Gengzigang committed on
Commit a7c1a46
1 Parent(s): 1b9deef
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+model.safetensors filter=lfs diff=lfs merge=lfs -text
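The new rule keeps `model.safetensors` in Git LFS, so a clone made without LFS objects contains only a small pointer file instead of the weights. A minimal sketch for detecting that case, assuming the pointer format shown in the RENAMED section at the end of this commit; the helper name is illustrative only:

```python
# Check whether model.safetensors is real weight data or still a Git LFS pointer.
# LFS pointers start with the spec line visible in the pointer diff below.
def is_lfs_pointer(path: str) -> bool:
    with open(path, "rb") as f:
        return f.read(100).startswith(b"version https://git-lfs.github.com/spec/v1")

print(is_lfs_pointer("model.safetensors"))
```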
README.md CHANGED
@@ -37,7 +37,6 @@ import torch
 
 image_path = "CLIP.png"
 model_name_or_path = "LLM2CLIP-Openai-L-14-336" # or /path/to/local/LLM2CLIP-Openai-L-14-336
-image_size =336
 
 processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
 model = AutoModel.from_pretrained(
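The dropped `image_size` assignment is not needed by the rest of the snippet: the 336-pixel preprocessing comes from the `CLIPImageProcessor`. A minimal sketch of the resulting usage, where the `from_pretrained` keyword arguments and the `get_image_features` call are assumed rather than copied from the README:

```python
# Sketch of the README usage after this change: no hard-coded image_size,
# the 336x336 resolution is handled by the CLIPImageProcessor itself.
import torch
from PIL import Image
from transformers import AutoModel, CLIPImageProcessor

image_path = "CLIP.png"
model_name_or_path = "LLM2CLIP-Openai-L-14-336"  # or /path/to/local/LLM2CLIP-Openai-L-14-336

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14-336")
model = AutoModel.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,  # auto_map routes AutoModel to the repo's modeling_clip.LLM2CLIPModel
).eval()

image = Image.open(image_path)
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    # Assumes the custom LLM2CLIPModel keeps CLIP's get_image_features API.
    image_features = model.get_image_features(inputs.pixel_values)
print(image_features.shape)
```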
config.json CHANGED
@@ -1,97 +1,16 @@
 {
-  "_commit_hash": null,
-  "_name_or_path": "LLM2CLIP-Openai-L-14",
+  "_name_or_path": "LLM2CLIP-Openai-L-14-336",
   "architectures": [
-    "CLIPModel"
+    "LLM2CLIPModel"
   ],
   "auto_map": {
     "AutoConfig": "configuration_clip.CLIPConfig",
-    "AutoModel": "modeling_clip.CLIPModel"
+    "AutoModel": "modeling_clip.LLM2CLIPModel"
   },
   "initializer_factor": 1.0,
   "logit_scale_init_value": 2.6592,
   "model_type": "clip",
   "projection_dim": 1280,
-  "text_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": 0,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": 2,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "gelu",
-    "hidden_size": 512,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 2048,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "k_bias": true,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "max_position_embeddings": 77,
-    "min_length": 0,
-    "model_type": "clip_text_model",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 8,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_hidden_layers": 12,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": 1,
-    "post_layernorm": false,
-    "prefix": null,
-    "problem_type": null,
-    "projection_dim": 512,
-    "pruned_heads": {},
-    "q_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.44.2",
-    "typical_p": 1.0,
-    "use_bfloat16": false,
-    "v_bias": true,
-    "vocab_size": 49408
-  },
   "torch_dtype": "float32",
   "transformers_version": null,
   "vision_config": {
@@ -100,7 +19,6 @@
     "architectures": null,
     "attention_dropout": 0.0,
     "bad_words_ids": null,
-    "begin_suppress_tokens": null,
     "bos_token_id": null,
     "chunk_size_feed_forward": 0,
     "cross_attention_hidden_size": null,
@@ -115,7 +33,7 @@
     "finetuning_task": null,
     "forced_bos_token_id": null,
     "forced_eos_token_id": null,
-    "hidden_act": "gelu",
+    "hidden_act": "quick_gelu",
     "hidden_size": 1024,
     "id2label": {
       "0": "LABEL_0",
@@ -127,7 +45,6 @@
     "intermediate_size": 4096,
     "is_decoder": false,
     "is_encoder_decoder": false,
-    "k_bias": true,
     "label2id": {
       "LABEL_0": 0,
       "LABEL_1": 1
@@ -149,18 +66,15 @@
     "output_scores": false,
     "pad_token_id": null,
     "patch_size": 14,
-    "post_layernorm": false,
     "prefix": null,
     "problem_type": null,
-    "projection_dim": 768,
+    "projection_dim": 1280,
    "pruned_heads": {},
-    "q_bias": true,
     "remove_invalid_values": false,
     "repetition_penalty": 1.0,
     "return_dict": true,
     "return_dict_in_generate": false,
     "sep_token_id": null,
-    "suppress_tokens": null,
     "task_specific_params": null,
     "temperature": 1.0,
     "tf_legacy_loss": false,
@@ -171,9 +85,17 @@
     "top_p": 1.0,
     "torch_dtype": null,
     "torchscript": false,
-    "transformers_version": "4.44.2",
+    "transformers_version": "4.21.3",
     "typical_p": 1.0,
-    "use_bfloat16": false,
-    "v_bias": true
+    "use_bfloat16": false
+  },
+  "vision_config_dict": {
+    "hidden_size": 1024,
+    "image_size": 336,
+    "intermediate_size": 4096,
+    "num_attention_heads": 16,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "projection_dim": 1280
   }
-}
+}
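After this change the checkpoint advertises the custom `LLM2CLIPModel` through `auto_map`, drops the bundled `text_config`, and carries the vision-tower settings in `vision_config_dict`, which `CLIPConfig` merges into `vision_config` (see the `vision_config.update(_vision_config_dict)` path in configuration_clip.py below). A minimal sketch of inspecting the merged configuration, assuming the model id used in the README and that `trust_remote_code=True` is acceptable:

```python
from transformers import AutoConfig

# trust_remote_code is required because auto_map points AutoConfig at the
# repo-local configuration_clip.CLIPConfig rather than the stock CLIP config.
config = AutoConfig.from_pretrained("LLM2CLIP-Openai-L-14-336", trust_remote_code=True)

print(config.projection_dim)            # 1280
print(config.vision_config.image_size)  # 336, merged in from vision_config_dict
print(config.vision_config.hidden_act)  # "quick_gelu" after this commit
```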
configuration_clip.py CHANGED
@@ -26,9 +26,9 @@ if TYPE_CHECKING:
     from transformers.utils import TensorType
 
 from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxConfig
 from transformers.utils import logging
 
-
 logger = logging.get_logger(__name__)
 
 
@@ -50,25 +50,33 @@ class CLIPTextConfig(PretrainedConfig):
             Dimensionality of the encoder layers and the pooler layer.
         intermediate_size (`int`, *optional*, defaults to 2048):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
         num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 8):
             Number of attention heads for each attention layer in the Transformer encoder.
-        max_position_embeddings (`int`, *optional*, defaults to 77):`
+        max_position_embeddings (`int`, *optional*, defaults to 77):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (`float`, *optional*, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1.0):
             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
             testing).
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 49406):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 49407):
+            End of stream token id.
 
     Example:
 
@@ -84,6 +92,7 @@ class CLIPTextConfig(PretrainedConfig):
     >>> # Accessing the model configuration
     >>> configuration = model.config
     ```"""
+
     model_type = "clip_text_model"
 
     def __init__(
@@ -95,18 +104,16 @@ class CLIPTextConfig(PretrainedConfig):
         num_hidden_layers=12,
         num_attention_heads=8,
         max_position_embeddings=77,
-        hidden_act="gelu",
+        hidden_act="quick_gelu",
         layer_norm_eps=1e-5,
         attention_dropout=0.0,
         initializer_range=0.02,
         initializer_factor=1.0,
-        q_bias=True,
-        k_bias=True,
-        v_bias=True,
-        post_layernorm=False,
+        # This differs from `CLIPTokenizer`'s default and from openai/clip
+        # See https://github.com/huggingface/transformers/pull/24773#issuecomment-1632287538
         pad_token_id=1,
-        bos_token_id=0,
-        eos_token_id=2,
+        bos_token_id=49406,
+        eos_token_id=49407,
         **kwargs,
     ):
         super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
@@ -122,14 +129,12 @@ class CLIPTextConfig(PretrainedConfig):
         self.hidden_act = hidden_act
         self.initializer_range = initializer_range
         self.initializer_factor = initializer_factor
-        self.q_bias=q_bias
-        self.k_bias=k_bias
-        self.v_bias=v_bias
-        self.post_layernorm = post_layernorm
         self.attention_dropout = attention_dropout
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
 
         # get the text config dict if we are loading from CLIPConfig
@@ -160,24 +165,28 @@ class CLIPVisionConfig(PretrainedConfig):
             Dimensionality of the encoder layers and the pooler layer.
         intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
         num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
         image_size (`int`, *optional*, defaults to 224):
             The size (resolution) of each image.
         patch_size (`int`, *optional*, defaults to 32):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
-            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (`float`, *optional*, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1.0):
             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
             testing).
 
@@ -208,15 +217,11 @@ class CLIPVisionConfig(PretrainedConfig):
         num_channels=3,
         image_size=224,
         patch_size=32,
-        hidden_act="gelu",
+        hidden_act="quick_gelu",
         layer_norm_eps=1e-5,
         attention_dropout=0.0,
         initializer_range=0.02,
         initializer_factor=1.0,
-        q_bias=True,
-        k_bias=True,
-        v_bias=True,
-        post_layernorm=False,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -231,16 +236,14 @@ class CLIPVisionConfig(PretrainedConfig):
         self.image_size = image_size
         self.initializer_range = initializer_range
         self.initializer_factor = initializer_factor
-        self.q_bias=q_bias
-        self.k_bias=k_bias
-        self.v_bias=v_bias
-        self.post_layernorm = post_layernorm
         self.attention_dropout = attention_dropout
         self.layer_norm_eps = layer_norm_eps
         self.hidden_act = hidden_act
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+        cls._set_token_in_kwargs(kwargs)
+
         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
 
         # get the vision config dict if we are loading from CLIPConfig
@@ -272,9 +275,9 @@ class CLIPConfig(PretrainedConfig):
         vision_config (`dict`, *optional*):
             Dictionary of configuration options used to initialize [`CLIPVisionConfig`].
         projection_dim (`int`, *optional*, defaults to 512):
-            Dimentionality of text and vision projection layers.
+            Dimensionality of text and vision projection layers.
         logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
-            The inital value of the *logit_scale* paramter. Default is used as per the original CLIP implementation.
+            The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation.
         kwargs (*optional*):
             Dictionary of keyword arguments.
 
@@ -303,7 +306,6 @@ class CLIPConfig(PretrainedConfig):
     ```"""
 
     model_type = "clip"
-    is_composition = True
 
     def __init__(
         self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
@@ -339,9 +341,9 @@ class CLIPConfig(PretrainedConfig):
                     else:
                         message = (
                             f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
-                            f'value `text_config["{key}"]` will be overriden.'
+                            f'value `text_config["{key}"]` will be overridden.'
                         )
-                    logger.warning(message)
+                    logger.info(message)
 
             # Update all values in `text_config` with the ones in `_text_config_dict`.
             text_config.update(_text_config_dict)
@@ -371,9 +373,9 @@ class CLIPConfig(PretrainedConfig):
                     else:
                         message = (
                             f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
-                            f'The value `vision_config["{key}"]` will be overriden.'
+                            f'The value `vision_config["{key}"]` will be overridden.'
                         )
-                    logger.warning(message)
+                    logger.info(message)
 
             # Update all values in `vision_config` with the ones in `_vision_config_dict`.
             vision_config.update(_vision_config_dict)
@@ -405,16 +407,48 @@ class CLIPConfig(PretrainedConfig):
 
         return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs)
 
-    def to_dict(self):
-        """
-        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
-
-        Returns:
-            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
-        """
-        output = copy.deepcopy(self.__dict__)
-        output["text_config"] = self.text_config.to_dict()
-        output["vision_config"] = self.vision_config.to_dict()
-        output["model_type"] = self.__class__.model_type
-        return output
 
+class CLIPOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("input_ids", {0: "batch", 1: "sequence"}),
+                ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+                ("attention_mask", {0: "batch", 1: "sequence"}),
+            ]
+        )
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("logits_per_image", {0: "batch"}),
+                ("logits_per_text", {0: "batch"}),
+                ("text_embeds", {0: "batch"}),
+                ("image_embeds", {0: "batch"}),
+            ]
+        )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-4
+
+    def generate_dummy_inputs(
+        self,
+        processor: "ProcessorMixin",
+        batch_size: int = -1,
+        seq_length: int = -1,
+        framework: Optional["TensorType"] = None,
+    ) -> Mapping[str, Any]:
+        text_input_dict = super().generate_dummy_inputs(
+            processor.tokenizer, batch_size=batch_size, seq_length=seq_length, framework=framework
+        )
+        image_input_dict = super().generate_dummy_inputs(
+            processor.image_processor, batch_size=batch_size, framework=framework
+        )
+        return {**text_input_dict, **image_input_dict}
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 14
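The added `CLIPOnnxConfig` follows the ONNX export helper shipped with upstream `transformers`. A small sketch of exercising it, assuming a local checkout of this repo (so `configuration_clip.py` is importable) and that the legacy `transformers.onnx` utilities are still available; the processor id is the one used in the README, and nothing here is part of the commit itself:

```python
from transformers import CLIPProcessor
from transformers.utils import TensorType

# Repo-local module from this commit; run from a checkout of the repository.
from configuration_clip import CLIPConfig, CLIPOnnxConfig

onnx_config = CLIPOnnxConfig(CLIPConfig())
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14-336")

# generate_dummy_inputs combines a text batch from the tokenizer with a pixel
# batch from the image processor, matching the declared `inputs` axes.
dummy_inputs = onnx_config.generate_dummy_inputs(processor, framework=TensorType.PYTORCH)
print(sorted(dummy_inputs))            # ['attention_mask', 'input_ids', 'pixel_values']
print(onnx_config.default_onnx_opset)  # 14
print(dict(onnx_config.outputs))       # axes declared for the exported outputs
```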
pytorch_model.bin → model.safetensors RENAMED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0921e6e15fae7a2a28459008d97c50c7fc099bad0bc57bf1573f28e9354a3cbc
-size 1219403118
+oid sha256:b735de584f3270fe5a818cba1724bf387d90e14ecf703fb5ac2829a16c711961
+size 2314403228
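The weights are now stored as `model.safetensors` (about 2.3 GB, per the LFS pointer above). A quick sketch for sanity-checking a downloaded copy, assuming the `safetensors` package is installed and the file sits in the current directory:

```python
import os
from safetensors import safe_open

path = "model.safetensors"
print(os.path.getsize(path))  # expected: 2314403228 bytes, matching the LFS pointer above

# safe_open reads only the header, so listing tensor names is cheap even for a ~2.3 GB file.
with safe_open(path, framework="pt") as f:
    names = list(f.keys())
print(len(names), names[:3])
```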