BAAI
/

Feature Extraction
Transformers
PyTorch
clip
custom_code
QuanSun committed on
Commit
902b080
1 Parent(s): 5e2e6de

upload py and json

Browse files
CLIP.png ADDED
config.json ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "_name_or_path": "EVA-CLIP-8B",
4
+ "architectures": [
5
+ "EvaCLIPModel"
6
+ ],
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_evaclip.EvaCLIPConfig",
9
+ "AutoModel": "modeling_evaclip.EvaCLIPModel"
10
+ },
11
+ "initializer_factor": 1.0,
12
+ "logit_scale_init_value": 2.659260036932778,
13
+ "model_type": "clip",
14
+ "projection_dim": 1280,
15
+ "text_config": {
16
+ "_name_or_path": "",
17
+ "add_cross_attention": false,
18
+ "architectures": null,
19
+ "attention_dropout": 0.0,
20
+ "bad_words_ids": null,
21
+ "begin_suppress_tokens": null,
22
+ "bos_token_id": 0,
23
+ "chunk_size_feed_forward": 0,
24
+ "cross_attention_hidden_size": null,
25
+ "decoder_start_token_id": null,
26
+ "diversity_penalty": 0.0,
27
+ "do_sample": false,
28
+ "dropout": 0.0,
29
+ "early_stopping": false,
30
+ "encoder_no_repeat_ngram_size": 0,
31
+ "eos_token_id": 2,
32
+ "exponential_decay_length_penalty": null,
33
+ "finetuning_task": null,
34
+ "forced_bos_token_id": null,
35
+ "forced_eos_token_id": null,
36
+ "hidden_act": "gelu",
37
+ "hidden_size": 1280,
38
+ "id2label": {
39
+ "0": "LABEL_0",
40
+ "1": "LABEL_1"
41
+ },
42
+ "initializer_factor": 1.0,
43
+ "initializer_range": 0.02,
44
+ "intermediate_size": 5120,
45
+ "is_decoder": false,
46
+ "is_encoder_decoder": false,
47
+ "k_bias": true,
48
+ "label2id": {
49
+ "LABEL_0": 0,
50
+ "LABEL_1": 1
51
+ },
52
+ "layer_norm_eps": 1e-05,
53
+ "length_penalty": 1.0,
54
+ "max_length": 20,
55
+ "max_position_embeddings": 77,
56
+ "min_length": 0,
57
+ "model_type": "clip_text_model",
58
+ "no_repeat_ngram_size": 0,
59
+ "num_attention_heads": 20,
60
+ "num_beam_groups": 1,
61
+ "num_beams": 1,
62
+ "num_hidden_layers": 32,
63
+ "num_return_sequences": 1,
64
+ "output_attentions": false,
65
+ "output_hidden_states": false,
66
+ "output_scores": false,
67
+ "pad_token_id": 1,
68
+ "post_layernorm": false,
69
+ "prefix": null,
70
+ "problem_type": null,
71
+ "projection_dim": 512,
72
+ "pruned_heads": {},
73
+ "q_bias": true,
74
+ "remove_invalid_values": false,
75
+ "repetition_penalty": 1.0,
76
+ "return_dict": true,
77
+ "return_dict_in_generate": false,
78
+ "sep_token_id": null,
79
+ "suppress_tokens": null,
80
+ "task_specific_params": null,
81
+ "temperature": 1.0,
82
+ "tf_legacy_loss": false,
83
+ "tie_encoder_decoder": false,
84
+ "tie_word_embeddings": true,
85
+ "tokenizer_class": null,
86
+ "top_k": 50,
87
+ "top_p": 1.0,
88
+ "torch_dtype": null,
89
+ "torchscript": false,
90
+ "transformers_version": "4.28.1",
91
+ "typical_p": 1.0,
92
+ "use_bfloat16": true,
93
+ "use_rms_norm": false,
94
+ "v_bias": true,
95
+ "vocab_size": 49408
96
+ },
97
+ "torch_dtype": "float32",
98
+ "transformers_version": "4.28.1",
99
+ "vision_config": {
100
+ "_name_or_path": "",
101
+ "add_cross_attention": false,
102
+ "architectures": null,
103
+ "attention_dropout": 0.0,
104
+ "bad_words_ids": null,
105
+ "begin_suppress_tokens": null,
106
+ "bos_token_id": null,
107
+ "chunk_size_feed_forward": 0,
108
+ "cross_attention_hidden_size": null,
109
+ "decoder_start_token_id": null,
110
+ "diversity_penalty": 0.0,
111
+ "do_sample": false,
112
+ "dropout": 0.0,
113
+ "early_stopping": false,
114
+ "encoder_no_repeat_ngram_size": 0,
115
+ "eos_token_id": null,
116
+ "exponential_decay_length_penalty": null,
117
+ "finetuning_task": null,
118
+ "forced_bos_token_id": null,
119
+ "forced_eos_token_id": null,
120
+ "hidden_act": "gelu",
121
+ "hidden_size": 4096,
122
+ "id2label": {
123
+ "0": "LABEL_0",
124
+ "1": "LABEL_1"
125
+ },
126
+ "image_size": 224,
127
+ "initializer_factor": 1.0,
128
+ "initializer_range": 0.02,
129
+ "intermediate_size": 20480,
130
+ "is_decoder": false,
131
+ "is_encoder_decoder": false,
132
+ "k_bias": false,
133
+ "label2id": {
134
+ "LABEL_0": 0,
135
+ "LABEL_1": 1
136
+ },
137
+ "layer_norm_eps": 1e-06,
138
+ "length_penalty": 1.0,
139
+ "max_length": 20,
140
+ "min_length": 0,
141
+ "model_type": "clip_vision_model",
142
+ "no_repeat_ngram_size": 0,
143
+ "num_attention_heads": 32,
144
+ "num_beam_groups": 1,
145
+ "num_beams": 1,
146
+ "num_channels": 3,
147
+ "num_hidden_layers": 32,
148
+ "num_return_sequences": 1,
149
+ "output_attentions": false,
150
+ "output_hidden_states": false,
151
+ "output_scores": false,
152
+ "pad_token_id": null,
153
+ "patch_size": 14,
154
+ "post_layernorm": false,
155
+ "prefix": null,
156
+ "problem_type": null,
157
+ "projection_dim": 512,
158
+ "pruned_heads": {},
159
+ "q_bias": false,
160
+ "remove_invalid_values": false,
161
+ "repetition_penalty": 1.0,
162
+ "return_dict": true,
163
+ "return_dict_in_generate": false,
164
+ "sep_token_id": null,
165
+ "suppress_tokens": null,
166
+ "task_specific_params": null,
167
+ "temperature": 1.0,
168
+ "tf_legacy_loss": false,
169
+ "tie_encoder_decoder": false,
170
+ "tie_word_embeddings": true,
171
+ "tokenizer_class": null,
172
+ "top_k": 50,
173
+ "top_p": 1.0,
174
+ "torch_dtype": null,
175
+ "torchscript": false,
176
+ "transformers_version": "4.28.1",
177
+ "typical_p": 1.0,
178
+ "use_bfloat16": true,
179
+ "use_rms_norm": true,
180
+ "v_bias": false
181
+ }
182
+ }
configuration_evaclip.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ """ EvaCLIP model configuration"""
3
+ # Code mainly copied from: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/configuration_clip.py
4
+ # and adjusted for evaclip
5
+
6
+ import copy
7
+ import os
8
+ from collections import OrderedDict
9
+ from typing import TYPE_CHECKING, Any, Mapping, Optional, Union
10
+
11
+
12
+ if TYPE_CHECKING:
13
+ from transformers.processing_utils import ProcessorMixin
14
+ from transformers.utils import TensorType
15
+
16
+ from transformers.configuration_utils import PretrainedConfig
17
+ from transformers.utils import logging
18
+
19
+
20
+ logger = logging.get_logger(__name__)
21
+
22
+
23
class EvaCLIPTextConfig(PretrainedConfig):
    r"""
    Configuration of the text encoder of an EVA-CLIP model.

    The class only stores hyper-parameters; it inherits from [`PretrainedConfig`], and every keyword argument that
    is not listed below is forwarded to the base class unchanged.

    Args:
        vocab_size (`int`, *optional*, defaults to 49408):
            Vocabulary size of the text model.
        hidden_size (`int`, *optional*, defaults to 512):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored as `projection_dim`; presumably the size of the projection head (confirm in the modeling code).
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        max_position_embeddings (`int`, *optional*, defaults to 77):
            The maximum sequence length that this model might ever be used with.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for
            initialization testing).
        q_bias, k_bias, v_bias (`bool`, *optional*, default to `True`):
            Stored as-is; presumably toggle the bias of the query/key/value projections (confirm in the modeling
            code).
        post_layernorm (`bool`, *optional*, defaults to `False`):
            Stored as-is; its effect is defined by the modeling code.
        pad_token_id (`int`, *optional*, defaults to 1):
            Forwarded to [`PretrainedConfig`].
        bos_token_id (`int`, *optional*, defaults to 0):
            Forwarded to [`PretrainedConfig`].
        eos_token_id (`int`, *optional*, defaults to 2):
            Forwarded to [`PretrainedConfig`].
        use_rms_norm (`bool`, *optional*, defaults to `False`):
            Stored as-is; presumably selects RMS normalization instead of layer normalization (confirm in the
            modeling code).
    """

    model_type = "clip_text_model"

    def __init__(
        self,
        vocab_size=49408,
        hidden_size=512,
        intermediate_size=2048,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=8,
        max_position_embeddings=77,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        q_bias=True,
        k_bias=True,
        v_bias=True,
        post_layernorm=False,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        use_rms_norm=False,
        **kwargs,
    ):
        # The special-token ids are owned by the base class, so they are handed over rather than stored here.
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )

        # Architecture sizes.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.max_position_embeddings = max_position_embeddings

        # Normalization, activation and initialization.
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor

        # EVA-specific switches.
        self.q_bias = q_bias
        self.k_bias = k_bias
        self.v_bias = v_bias
        self.post_layernorm = post_layernorm
        self.attention_dropout = attention_dropout
        self.use_rms_norm = use_rms_norm

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Load a text config, accepting either a standalone text config or a composite `"clip"` config.

        When the loaded dictionary has `model_type == "clip"`, its `"text_config"` entry is used. A warning is
        logged if the resulting `model_type` differs from this class's `model_type`.
        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite config nests the text settings one level down.
        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["text_config"]

        type_mismatch = (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        )
        if type_mismatch:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
136
+
137
+
138
class EvaCLIPVisionConfig(PretrainedConfig):
    r"""
    Configuration of the vision encoder of an EVA-CLIP model.

    The class only stores hyper-parameters; it inherits from [`PretrainedConfig`], and every keyword argument that
    is not listed below is forwarded to the base class unchanged.

    Args:
        hidden_size (`int`, *optional*, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 3072):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 512):
            Stored as `projection_dim`; presumably the size of the projection head (confirm in the modeling code).
        num_hidden_layers (`int`, *optional*, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            Stored as `num_channels`; presumably the number of input image channels (confirm in the modeling code).
        image_size (`int`, *optional*, defaults to 224):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 32):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler.
        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for
            initialization testing).
        q_bias, k_bias, v_bias (`bool`, *optional*, default to `True`):
            Stored as-is; presumably toggle the bias of the query/key/value projections (confirm in the modeling
            code).
        post_layernorm (`bool`, *optional*, defaults to `False`):
            Stored as-is; its effect is defined by the modeling code.
        use_rms_norm (`bool`, *optional*, defaults to `True`):
            Stored as-is; presumably selects RMS normalization instead of layer normalization (confirm in the
            modeling code).
    """

    model_type = "clip_vision_model"

    def __init__(
        self,
        hidden_size=768,
        intermediate_size=3072,
        projection_dim=512,
        num_hidden_layers=12,
        num_attention_heads=12,
        num_channels=3,
        image_size=224,
        patch_size=32,
        hidden_act="gelu",
        layer_norm_eps=1e-5,
        attention_dropout=0.0,
        initializer_range=0.02,
        initializer_factor=1.0,
        q_bias=True,
        k_bias=True,
        v_bias=True,
        post_layernorm=False,
        use_rms_norm=True,
        **kwargs,
    ):
        super().__init__(**kwargs)

        # Architecture sizes.
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.projection_dim = projection_dim
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # Input geometry.
        self.num_channels = num_channels
        self.patch_size = patch_size
        self.image_size = image_size

        # Initialization.
        self.initializer_range = initializer_range
        self.initializer_factor = initializer_factor

        # EVA-specific switches, then normalization/activation settings.
        self.q_bias = q_bias
        self.k_bias = k_bias
        self.v_bias = v_bias
        self.post_layernorm = post_layernorm
        self.attention_dropout = attention_dropout
        self.layer_norm_eps = layer_norm_eps
        self.hidden_act = hidden_act
        self.use_rms_norm = use_rms_norm

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
        """
        Load a vision config, accepting either a standalone vision config or a composite `"clip"` config.

        When the loaded dictionary has `model_type == "clip"`, its `"vision_config"` entry is used. A warning is
        logged if the resulting `model_type` differs from this class's `model_type`.
        """
        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)

        # A composite config nests the vision settings one level down.
        if config_dict.get("model_type") == "clip":
            config_dict = config_dict["vision_config"]

        type_mismatch = (
            "model_type" in config_dict
            and hasattr(cls, "model_type")
            and config_dict["model_type"] != cls.model_type
        )
        if type_mismatch:
            logger.warning(
                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
            )

        return cls.from_dict(config_dict, **kwargs)
249
+
250
+
251
class EvaCLIPConfig(PretrainedConfig):
    r"""
    Composite configuration of an EVA-CLIP model: one [`EvaCLIPTextConfig`] plus one [`EvaCLIPVisionConfig`].

    Configuration objects inherit from [`PretrainedConfig`]; keyword arguments not listed below are forwarded to
    the base class.

    Args:
        text_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`EvaCLIPTextConfig`]. `None` means defaults.
        vision_config (`dict`, *optional*):
            Dictionary of configuration options used to initialize [`EvaCLIPVisionConfig`]. `None` means defaults.
        projection_dim (`int`, *optional*, defaults to 512):
            Dimensionality of text and vision projection layers.
        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
            The initial value of the *logit_scale* parameter.
        kwargs (*optional*):
            Dictionary of keyword arguments. The legacy keys `text_config_dict` and `vision_config_dict` are
            consumed here: when given, their (fully expanded) values are merged over `text_config` /
            `vision_config` and take precedence.

    Note:
        `initializer_factor` is always set to `1.0` on the composite config.
    """

    model_type = "clip"
    is_composition = True

    def __init__(
        self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs
    ):
        # The legacy `*_config_dict` entries are taken out of `kwargs` before the base constructor runs, so they
        # never end up stored on (and saved with) the config.
        text_config_dict = kwargs.pop("text_config_dict", None)
        vision_config_dict = kwargs.pop("vision_config_dict", None)

        super().__init__(**kwargs)

        # The legacy dictionaries are not assigned wholesale: their values are merged over the regular
        # `text_config` / `vision_config` dictionaries, with a warning for every conflicting key.
        if text_config_dict is not None:
            text_config = {} if text_config is None else text_config

            # Fully expanded form of the legacy dictionary (defaults included).
            _text_config_dict = EvaCLIPTextConfig(**text_config_dict).to_dict()

            for key, value in _text_config_dict.items():
                # Only keys present on both sides with different values are reported.
                if key not in text_config or value == text_config[key] or key in ["transformers_version"]:
                    continue

                if key in text_config_dict:
                    # The value was given explicitly in the legacy dictionary.
                    message = (
                        f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. "
                        f'The value `text_config_dict["{key}"]` will be used instead.'
                    )
                else:
                    # The value comes from a default of the text config class.
                    message = (
                        f"`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The "
                        f'value `text_config["{key}"]` will be overriden.'
                    )
                logger.warning(message)

            # The expanded legacy values win.
            text_config.update(_text_config_dict)

        if vision_config_dict is not None:
            vision_config = {} if vision_config is None else vision_config

            # Fully expanded form of the legacy dictionary (defaults included).
            _vision_config_dict = EvaCLIPVisionConfig(**vision_config_dict).to_dict()
            # `id2label` keys are normalized to strings before comparing/merging.
            if "id2label" in _vision_config_dict:
                _vision_config_dict["id2label"] = {
                    str(key): value for key, value in _vision_config_dict["id2label"].items()
                }

            for key, value in _vision_config_dict.items():
                # Only keys present on both sides with different values are reported.
                if key not in vision_config or value == vision_config[key] or key in ["transformers_version"]:
                    continue

                if key in vision_config_dict:
                    # The value was given explicitly in the legacy dictionary.
                    message = (
                        f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different "
                        f'values. The value `vision_config_dict["{key}"]` will be used instead.'
                    )
                else:
                    # The value comes from a default of the vision config class.
                    message = (
                        f"`vision_config_dict` is provided which will be used to initialize `CLIPVisionConfig`. "
                        f'The value `vision_config["{key}"]` will be overriden.'
                    )
                logger.warning(message)

            # The expanded legacy values win.
            vision_config.update(_vision_config_dict)

        if text_config is None:
            text_config = {}
            logger.info("`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.")

        if vision_config is None:
            vision_config = {}
            logger.info("`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.")

        self.text_config = EvaCLIPTextConfig(**text_config)
        self.vision_config = EvaCLIPVisionConfig(**vision_config)

        self.projection_dim = projection_dim
        self.logit_scale_init_value = logit_scale_init_value
        self.initializer_factor = 1.0

    @classmethod
    def from_text_vision_configs(cls, text_config: EvaCLIPTextConfig, vision_config: EvaCLIPVisionConfig, **kwargs):
        r"""
        Build a composite config from an existing text config object and vision config object.

        Returns:
            An instance of `cls` created from the dictionaries of the two sub-configurations.
        """
        text_dict = text_config.to_dict()
        vision_dict = vision_config.to_dict()
        return cls(text_config=text_dict, vision_config=vision_dict, **kwargs)

    def to_dict(self):
        """
        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].

        Returns:
            `Dict[str, any]`: A deep copy of the instance attributes in which the two sub-configurations are
            replaced by their own dictionaries and `model_type` is set from the class attribute.
        """
        serialized = copy.deepcopy(self.__dict__)
        serialized["text_config"] = self.text_config.to_dict()
        serialized["vision_config"] = self.vision_config.to_dict()
        serialized["model_type"] = self.__class__.model_type
        return serialized
convert_evaclip_8b_pytorch_to_hf.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ # Part of the code was taken from:
17
+ # https://github.com/huggingface/transformers/blob/main/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py
18
+
19
+ import argparse
20
+
21
+ import os, sys
22
+
23
+ sys.path.append(os.getcwd())
24
+
25
+
26
+ import torch
27
+ from PIL import Image
28
+ from transformers import AutoModel, AutoConfig
29
+ from transformers import CLIPImageProcessor, pipeline, CLIPTokenizer
30
+ from EVA_CLIP_8B.configuration_evaclip import EvaCLIPConfig
31
+ from EVA_CLIP_8B.modeling_evaclip import EvaCLIPModel
32
+
33
# Substring rewrite rules that turn original EVA-CLIP parameter names into the names used by the HF model.
# The rules are applied one after another in insertion order, each on the result of the previous one, so the
# ORDER IS SIGNIFICANT (e.g. "visual.blocks" and "visual.head" must be handled before the generic "visual.").
KEYS_TO_MODIFY_MAPPING = {
    "cls_token": "embeddings.class_embedding",
    "pos_embed": "embeddings.position_embedding.weight",
    "patch_embed.proj": "embeddings.patch_embedding",
    ".positional_embedding": ".embeddings.position_embedding.weight",
    ".token_embedding": ".embeddings.token_embedding",
    "text.text_projection": "text_projection.weight",
    "mlp.c_fc": "mlp.fc1",
    "mlp.c_proj": "mlp.fc2",
    ".proj.": ".out_proj.",
    "q_bias": "q_proj.bias",
    "v_bias": "v_proj.bias",
    "out.": "out_proj.",
    "norm1": "layer_norm1",
    "norm2": "layer_norm2",
    "ln_1": "layer_norm1",
    "ln_2": "layer_norm2",
    "attn": "self_attn",
    "norm.": "post_layernorm.",
    "ln_final": "final_layer_norm",
    "visual.blocks": "vision_model.encoder.layers",
    "text.transformer.resblocks": "text_model.encoder.layers",
    "visual.head": "visual_projection",
    "visual.": "vision_model.",
    "text.": "text_model.",
}


def _split_fused_qkv(fused, key):
    """Split a fused query/key/value tensor into three equal slices along dimension 0.

    Raises:
        ValueError: if the leading dimension is not divisible by 3. Without this check the three slices would
            silently get different sizes and the problem would only surface later, far from its cause.
    """
    rows = fused.size(0)
    if rows % 3 != 0:
        raise ValueError(
            f"Cannot split fused QKV parameter '{key}': leading dimension {rows} is not divisible by 3."
        )
    chunk = rows // 3
    return fused[:chunk], fused[chunk : chunk * 2], fused[chunk * 2 :]


def rename_state_dict(state_dict):
    """Convert an original EVA-CLIP state dict into the key/tensor layout expected by the HF model.

    Every key is rewritten with `KEYS_TO_MODIFY_MAPPING` (rules applied sequentially, in order). Depending on
    the rewritten key the tensor is then:

    * transposed, for keys containing `text_projection`;
    * split into separate `q_proj` / `k_proj` / `v_proj` entries, for fused `attn.qkv` and `attn.in_proj`
      parameters;
    * indexed with `[0, 0, :]` for `class_embedding` and with `[0, :, :]` for the vision position embedding,
      which drops the leading singleton dimension(s);
    * copied through unchanged otherwise.

    Args:
        state_dict: mapping from original parameter names to tensors. It is not modified.

    Returns:
        A new dict with the renamed keys.

    Raises:
        ValueError: if a fused QKV tensor has a leading dimension that is not divisible by 3.
    """
    model_state_dict = {}

    for key, value in state_dict.items():
        # `str.replace` is a no-op when the pattern is absent, so no membership test is needed.
        for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
            key = key.replace(key_to_modify, new_key)

        if "text_projection" in key:
            model_state_dict[key] = value.T
        elif "attn.qkv" in key:
            # "self_attn.qkv" still contains "attn.qkv", so this matches the renamed vision keys.
            query_layer, key_layer, value_layer = _split_fused_qkv(value, key)
            model_state_dict[key.replace("qkv", "q_proj")] = query_layer
            model_state_dict[key.replace("qkv", "k_proj")] = key_layer
            model_state_dict[key.replace("qkv", "v_proj")] = value_layer
        elif "attn.in_proj" in key:
            # `in_proj_weight` / `in_proj_bias` become `<x>_proj.weight` / `<x>_proj.bias`.
            query_layer, key_layer, value_layer = _split_fused_qkv(value, key)
            model_state_dict[key.replace("in_proj_", "q_proj.")] = query_layer
            model_state_dict[key.replace("in_proj_", "k_proj.")] = key_layer
            model_state_dict[key.replace("in_proj_", "v_proj.")] = value_layer
        elif "class_embedding" in key:
            model_state_dict[key] = value[0, 0, :]
        elif "vision_model.embeddings.position_embedding" in key:
            model_state_dict[key] = value[0, :, :]
        else:
            model_state_dict[key] = value

    return model_state_dict
105
+
106
def save_model_and_config(pytorch_dump_folder_path, hf_model, transformers_config):
    """Write the converted model and then its configuration into `pytorch_dump_folder_path`.

    Both objects are saved through their own `save_pretrained` method.
    """
    target_dir = pytorch_dump_folder_path
    hf_model.save_pretrained(target_dir)
    transformers_config.save_pretrained(target_dir)
109
+
110
def check_loaded_model(pytorch_dump_folder_path, tokenizer, processor, image, captions):
    """Reload the model from `pytorch_dump_folder_path` and run it through a zero-shot classification pipeline.

    The config and model are loaded with `trust_remote_code=True` (the folder ships custom modeling code), the
    pipeline scores `image` against `captions`, and the result is printed. Nothing is returned.
    """
    reloaded_config = AutoConfig.from_pretrained(pytorch_dump_folder_path, trust_remote_code=True)
    reloaded_model = AutoModel.from_pretrained(
        pytorch_dump_folder_path,
        config=reloaded_config,
        trust_remote_code=True,
    )
    detector = pipeline(
        model=reloaded_model,
        task="zero-shot-image-classification",
        tokenizer=tokenizer,
        image_processor=processor,
    )
    detector_probs = detector(image, candidate_labels=captions)
    print(f"text_probs loaded hf_model using pipeline: {detector_probs}")
116
+
117
def convert_evaclip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, image_path, save=False):
    """Convert an original EVA-CLIP checkpoint to the HF model and sanity-check it on one image.

    Steps: build the inputs (image + three fixed captions), instantiate the HF model from the config found at
    `config_path`, load the renamed checkpoint weights strictly, print the caption probabilities, optionally
    save the model and config, and finally re-check the model stored in `pytorch_dump_folder_path` through a
    pipeline.

    Args:
        checkpoint_path: path of the original PyTorch checkpoint.
        pytorch_dump_folder_path: folder the tokenizer is read from and the converted model is written to /
            reloaded from.
        config_path: location of the HF config to instantiate the model with.
        image_path: image used for the sanity check.
        save: when true, the converted model and config are written to `pytorch_dump_folder_path`.
    """
    # --- inputs -------------------------------------------------------------------------------------------
    processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")
    image = Image.open(image_path)
    captions = ["a diagram", "a dog", "a cat"]
    tokenizer = CLIPTokenizer.from_pretrained(pytorch_dump_folder_path)

    encoded_text = tokenizer(captions, return_tensors="pt", padding=True)
    input_ids = encoded_text.input_ids
    encoded_image = processor(images=image, return_tensors="pt", padding=True)
    input_pixels = encoded_image.pixel_values

    # --- model --------------------------------------------------------------------------------------------
    transformers_config = EvaCLIPConfig.from_pretrained(config_path)
    hf_model = EvaCLIPModel(transformers_config)
    # NOTE(review): `torch.load` unpickles the file; only use checkpoints from a trusted source.
    pt_model_state_dict = torch.load(checkpoint_path, map_location="cpu")
    state_dict = rename_state_dict(pt_model_state_dict)

    # strict=True: every parameter must be matched, so an incomplete key mapping fails loudly here.
    hf_model.load_state_dict(state_dict, strict=True)

    # --- sanity check on the freshly converted model --------------------------------------------------------
    with torch.no_grad():
        image_features = hf_model.encode_image(input_pixels)
        text_features = hf_model.encode_text(input_ids)
        # L2-normalize both embeddings in place.
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)

    similarity = 100.0 * image_features @ text_features.T
    label_probs = similarity.softmax(dim=-1)
    print(f"hf_model label probs: {label_probs}")

    if save:
        save_model_and_config(pytorch_dump_folder_path, hf_model, transformers_config)

    check_loaded_model(pytorch_dump_folder_path, tokenizer, processor, image, captions)
145
+
146
+
147
if __name__ == "__main__":
    # Command-line entry point: every option has a default, so the script can be run without arguments from a
    # directory that contains the `EVA_CLIP_8B` folder and the original checkpoint.
    parser = argparse.ArgumentParser()
    parser.add_argument("--pytorch_dump_folder_path", default="EVA_CLIP_8B", type=str, help="Path to the output PyTorch model.")
    parser.add_argument("--checkpoint_path", default="EVA_CLIP_8B_psz14_s9B.pt", type=str, help="Path to fairseq checkpoint")
    parser.add_argument("--config_path", default="EVA_CLIP_8B", type=str, help="Path to hf config.json of model to convert")
    parser.add_argument("--image_path", default="EVA_CLIP_8B/CLIP.png", type=str, help="Path to image")
    # `store_true` flag: absent -> False, present -> True. The help text previously claimed the default was True.
    parser.add_argument("--save", default=False, action="store_true", help="Save the model and config to the pytorch_dump_folder_path. Default is False.")

    args = parser.parse_args()

    convert_evaclip_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.image_path, args.save)
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
modeling_evaclip.py ADDED
@@ -0,0 +1,1059 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ """ EvaCLIP model configuration"""
3
+ # Code mainly copied from here: https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py
4
+ # and adjusted for evaclip
5
+
6
+ from dataclasses import dataclass
7
+ from typing import Any, Optional, Tuple, Union
8
+
9
+ import torch
10
+ import torch.utils.checkpoint
11
+ from torch import nn
12
+
13
+ from transformers.activations import ACT2FN
14
+ from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
15
+ from transformers.modeling_utils import PreTrainedModel
16
+ from transformers.utils import (
17
+ ModelOutput,
18
+ logging,
19
+ )
20
+ from .configuration_evaclip import EvaCLIPConfig, EvaCLIPTextConfig, EvaCLIPVisionConfig
21
+
22
+ # try:
23
+ # from xformers import ops as xops
24
+ # except ImportError:
25
+ # xops = None
26
+
27
+
28
+ logger = logging.get_logger(__name__)
29
+
30
+
31
class RMSNorm(nn.Module):
    """
    Root-mean-square layer normalization, adapted from the transformers T5LayerNorm.

    Scales the input by the reciprocal RMS over the last dimension and by a learned
    per-feature weight. There is no bias and the mean is not subtracted
    (see https://arxiv.org/abs/1910.07467).
    """

    def __init__(self, hidden_size, eps=1e-6):
        """
        Args:
            hidden_size: size of the last (normalized) dimension; the weight has this many entries.
            eps: constant added to the mean square before taking the reciprocal square root.
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        """Return `weight * hidden_states / sqrt(mean(hidden_states**2, dim=-1) + eps)`."""
        # The mean of squares is accumulated in fp32 so half-precision inputs do not lose accuracy.
        squared = hidden_states.to(torch.float32).pow(2)
        mean_square = squared.mean(-1, keepdim=True)
        normed = hidden_states * torch.rsqrt(mean_square + self.variance_epsilon)

        # Cast back down when the learned weight itself is stored in half precision.
        if self.weight.dtype in (torch.float16, torch.bfloat16):
            normed = normed.to(self.weight.dtype)

        return self.weight * normed
57
+
58
+ # Copied from transformers.models.bart.modeling_bart._expand_mask
59
+ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
60
+ """
61
+ Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
62
+ """
63
+ bsz, src_len = mask.size()
64
+ tgt_len = tgt_len if tgt_len is not None else src_len
65
+
66
+ expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype)
67
+
68
+ inverted_mask = 1.0 - expanded_mask
69
+
70
+ return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
71
+
72
+
73
+ # contrastive loss function, adapted from
74
+ # https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html
75
def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
    """Cross-entropy of `logits` where the target class of row `i` is index `i`."""
    targets = torch.arange(len(logits), device=logits.device)
    return nn.functional.cross_entropy(logits, targets)
77
+
78
def clip_loss(similarity: torch.Tensor) -> torch.Tensor:
    """Average of the contrastive loss over the rows of `similarity` and over the rows of its transpose."""
    row_loss = contrastive_loss(similarity)
    column_loss = contrastive_loss(similarity.t())
    return (row_loss + column_loss) / 2.0
82
+
83
+
84
@dataclass
class EvaCLIPVisionModelOutput(ModelOutput):
    """
    Output container for a vision model that also returns projected image embeddings.

    Every field defaults to `None`.
    """

    # Image embeddings; presumably the projected pooled output - confirm against the producing model.
    image_embeds: Optional[torch.FloatTensor] = None
    # Hidden states of the last encoder layer.
    last_hidden_state: torch.FloatTensor = None
    # Per-layer hidden states, when requested.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Per-layer attention weights, when requested.
    attentions: Optional[Tuple[torch.FloatTensor]] = None
90
+
91
+
92
@dataclass
class EvaCLIPTextModelOutput(ModelOutput):
    """
    Output container for a text model that also returns projected text embeddings.

    Every field defaults to `None`.
    """

    # Text embeddings; presumably the projected pooled output - confirm against the producing model.
    text_embeds: Optional[torch.FloatTensor] = None
    # Hidden states of the last encoder layer.
    last_hidden_state: torch.FloatTensor = None
    # Per-layer hidden states, when requested.
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    # Per-layer attention weights, when requested.
    attentions: Optional[Tuple[torch.FloatTensor]] = None
98
+
99
+
100
@dataclass
class EvaCLIPOutput(ModelOutput):
    """
    Output container of the joint image/text model: optional loss, the two logit matrices,
    the text and image embeddings, and the raw outputs of the two towers.
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: torch.FloatTensor = None
    logits_per_text: torch.FloatTensor = None
    text_embeds: torch.FloatTensor = None
    image_embeds: torch.FloatTensor = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        """Convert to a plain tuple, also converting the two nested tower outputs to tuples."""
        nested_keys = ("text_model_output", "vision_model_output")
        values = []
        for key in self.keys():
            if key in nested_keys:
                # Nested outputs are flattened through their own `to_tuple`.
                values.append(getattr(self, key).to_tuple())
            else:
                values.append(self[key])
        return tuple(values)
115
+
116
+
117
class EvaCLIPVisionEmbeddings(nn.Module):
    """
    Patch, class-token and position embeddings for the vision tower.

    The image is cut into non-overlapping `patch_size` x `patch_size` patches by a strided
    convolution, a learned class embedding is prepended, and a learned position embedding
    is added to every token.
    """

    def __init__(self, config: EvaCLIPVisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.image_size = config.image_size
        self.patch_size = config.patch_size

        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))

        # Kernel size == stride, so each output position sees exactly one patch.
        self.patch_embedding = nn.Conv2d(
            in_channels=config.num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=True,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        # One extra position for the class token.
        self.num_positions = self.num_patches + 1
        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
        # Non-persistent buffer: follows the module across devices but is not written to the state dict.
        all_positions = torch.arange(self.num_positions).expand((1, -1))
        self.register_buffer("position_ids", all_positions, persistent=False)

    def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor:
        """Embed `pixel_values` into a token sequence with the class token at index 0."""
        batch_size = pixel_values.shape[0]

        # Conv output is [batch, embed_dim, grid, grid]; flatten the grid and move it to the sequence axis.
        patch_tokens = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)
        class_tokens = self.class_embedding.expand(batch_size, 1, -1)

        tokens = torch.cat([class_tokens, patch_tokens], dim=1)
        return tokens + self.position_embedding(self.position_ids)
149
+
150
+
151
class EvaCLIPTextEmbeddings(nn.Module):
    """Sum of learned token embeddings and learned absolute position embeddings for the text tower."""

    def __init__(self, config: EvaCLIPTextConfig):
        super().__init__()
        width = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, width)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, width)

        # position_ids has shape (1, max_position_embeddings). It is registered as a non-persistent
        # buffer, so it is not part of the serialized state dict.
        all_positions = torch.arange(config.max_position_embeddings).expand((1, -1))
        self.register_buffer("position_ids", all_positions, persistent=False)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        """
        Embed either `input_ids` or precomputed `inputs_embeds` and add position embeddings.

        When `position_ids` is omitted, positions `0..seq_length-1` are used. When `inputs_embeds`
        is given, `input_ids` is only used to determine the sequence length.
        """
        if input_ids is not None:
            seq_length = input_ids.shape[-1]
        else:
            seq_length = inputs_embeds.shape[-2]

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        return inputs_embeds + self.position_embedding(position_ids)
180
+
181
+
182
class EvaCLIPAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        # The q/k/v biases are individually switchable from the config; the output projection always has one.
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.k_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.v_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.q_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `[bsz, seq_len, embed_dim]` into `[bsz, num_heads, seq_len, head_dim]`."""
        per_head = tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
        return per_head.transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Input shape: Batch x Time x Channel

        Both masks, when given, are additive and must have shape `[bsz, 1, tgt_len, src_len]`.
        Returns `(attn_output, attn_weights)`; `attn_weights` has shape
        `[bsz, num_heads, tgt_len, src_len]` when `output_attentions` is true and is `None` otherwise.
        """
        bsz, tgt_len, embed_dim = hidden_states.size()
        merged_heads = (bsz * self.num_heads, -1, self.head_dim)

        # Queries are pre-scaled by head_dim ** -0.5 so the raw bmm already yields scaled logits.
        scaled_q = self.q_proj(hidden_states) * self.scale
        query_states = self._shape(scaled_q, tgt_len, bsz).view(*merged_heads)
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz).view(*merged_heads)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz).view(*merged_heads)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # Add the causal mask first, then the padding mask; each is broadcast over the head axis.
        for additive_mask in (causal_attention_mask, attention_mask):
            if additive_mask is None:
                continue
            if additive_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {additive_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + additive_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        attn_weights_reshaped = None
        if output_attentions:
            # The per-head view is returned to the caller, and the flat view derived from it is what
            # flows on, so the returned weights stay connected to the graph.
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Split the merged batch/head axis again and concatenate the heads along the channel axis.
        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim).transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        return self.out_proj(attn_output), attn_weights_reshaped
282
+
283
class EvaCLIPTextAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        # The q/k/v biases are individually switchable from the config; the output projection always has one.
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.k_bias)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.v_bias)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=config.q_bias)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=True)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        """Reshape `[bsz, seq_len, embed_dim]` into `[bsz, num_heads, seq_len, head_dim]`."""
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """
        Input shape: Batch x Time x Channel

        Both masks, when given, are additive and must have shape `[bsz, 1, tgt_len, src_len]`;
        otherwise a `ValueError` is raised. Returns `(attn_output, attn_weights)`; `attn_weights`
        has shape `[bsz, num_heads, tgt_len, src_len]` when `output_attentions` is true and is
        `None` otherwise.
        """

        bsz, tgt_len, embed_dim = hidden_states.size()

        # get query proj
        # The queries must be scaled by head_dim ** -0.5 (scaled dot-product attention), as in
        # EvaCLIPAttention. Previously `self.scale` was computed but never applied here, which left
        # the attention logits unscaled.
        query_states = self.q_proj(hidden_states) * self.scale
        key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
        value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        # Merge the batch and head axes so plain bmm can be used.
        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # apply the causal_attention_mask first
        if causal_attention_mask is not None:
            if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
                    f" {causal_attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if output_attentions:
            # this operation is a bit awkward, but it's required to
            # make sure that attn_weights keeps its gradient.
            # In order to do so, attn_weights have to be reshaped
            # twice and have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Split the merged batch/head axis again and concatenate the heads along the channel axis.
        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped
384
+
385
class EvaCLIPMLP(nn.Module):
    """Two-layer feed-forward block: hidden_size -> intermediate_size -> hidden_size."""

    def __init__(self, config):
        super().__init__()
        self.config = config
        # The activation is looked up by name from the config's `hidden_act`.
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply `fc2(activation(fc1(hidden_states)))`."""
        expanded = self.activation_fn(self.fc1(hidden_states))
        return self.fc2(expanded)
398
+
399
+
400
class EvaCLIPEncoderLayer(nn.Module):
    """
    One transformer block: self-attention and an MLP, each wrapped in a residual connection.

    The normalization is RMSNorm or LayerNorm depending on `config.use_rms_norm`. With
    `config.post_layernorm` falsy (or `None`) each norm is applied to the sub-layer input;
    otherwise it is applied to the sub-layer output before the residual add.
    """

    def __init__(self, config: EvaCLIPConfig):
        super().__init__()
        self.config = config
        norm_cls = RMSNorm if config.use_rms_norm else nn.LayerNorm
        self.embed_dim = config.hidden_size
        self.post_layernorm = False if config.post_layernorm is None else config.post_layernorm
        self.self_attn = EvaCLIPAttention(config)
        self.layer_norm1 = norm_cls(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = EvaCLIPMLP(config)
        self.layer_norm2 = norm_cls(self.embed_dim, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        causal_attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Run the block on `hidden_states`.

        Returns `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is true.
        """
        norm_after = self.post_layernorm

        # --- self-attention sub-layer ---
        attn_input = hidden_states if norm_after else self.layer_norm1(hidden_states)
        attn_out, attn_weights = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
        )
        if norm_after:
            attn_out = self.layer_norm1(attn_out)
        hidden_states = hidden_states + attn_out

        # --- feed-forward sub-layer ---
        mlp_input = hidden_states if norm_after else self.layer_norm2(hidden_states)
        mlp_out = self.mlp(mlp_input)
        if norm_after:
            mlp_out = self.layer_norm2(mlp_out)
        hidden_states = hidden_states + mlp_out

        if output_attentions:
            return (hidden_states, attn_weights)
        return (hidden_states,)
446
+
447
+
448
class EvaCLIPPreTrainedModel(PreTrainedModel):
    """
    Common base class of the EvaCLIP models. It declares the config class and load-time key
    handling, and implements weight initialization and the gradient-checkpointing toggle.
    """

    config_class = EvaCLIPConfig
    base_model_prefix = "clip"
    supports_gradient_checkpointing = True
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def _init_weights(self, module):
        """Initialize the weights of `module` according to its type; unknown module types are left as they are."""
        factor = self.config.initializer_factor

        if isinstance(module, EvaCLIPTextEmbeddings):
            module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
            module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02)
        elif isinstance(module, EvaCLIPVisionEmbeddings):
            range_std = module.config.initializer_range * factor
            nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor)
            nn.init.normal_(module.patch_embedding.weight, std=range_std)
            nn.init.normal_(module.position_embedding.weight, std=range_std)
        elif isinstance(module, EvaCLIPAttention):
            # The q/k/v std is additionally shrunk with the number of layers; the output projection's is not.
            in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            out_proj_std = (module.embed_dim**-0.5) * factor
            for projection in (module.q_proj, module.k_proj, module.v_proj):
                nn.init.normal_(projection.weight, std=in_proj_std)
            nn.init.normal_(module.out_proj.weight, std=out_proj_std)
        elif isinstance(module, EvaCLIPMLP):
            in_proj_std = (
                (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor
            )
            fc_std = (2 * module.config.hidden_size) ** -0.5 * factor
            nn.init.normal_(module.fc1.weight, std=fc_std)
            nn.init.normal_(module.fc2.weight, std=in_proj_std)
        elif isinstance(module, EvaCLIPModel):
            nn.init.normal_(module.text_projection.weight, std=module.text_embed_dim**-0.5 * factor)
            nn.init.normal_(module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * factor)
        elif isinstance(module, EvaCLIPVisionModelWithProjection):
            nn.init.normal_(module.visual_projection.weight, std=self.config.hidden_size**-0.5 * factor)
        elif isinstance(module, EvaCLIPTextModelWithProjection):
            nn.init.normal_(module.text_projection.weight, std=self.config.hidden_size**-0.5 * factor)

        # Independently of the branch above: reset LayerNorm to identity and zero every Linear bias.
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def _set_gradient_checkpointing(self, module, value=False):
        """Set the `gradient_checkpointing` flag on `module` if it is an `EvaCLIPEncoder`."""
        if not isinstance(module, EvaCLIPEncoder):
            return
        module.gradient_checkpointing = value
515
+
516
class EvaCLIPEncoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is an
    [`EvaCLIPEncoderLayer`].

    Args:
        config: EvaCLIPConfig
    """

    def __init__(self, config: EvaCLIPConfig):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([EvaCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        causal_attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        """
        Run `inputs_embeds` through every layer in order.

        The three output flags fall back to the config values when left as `None`. Returns a
        `BaseModelOutput`, or, when `return_dict` is false, a tuple of the non-`None` entries among
        (last hidden state, all hidden states, all attentions).
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        def as_checkpoint_fn(layer):
            # checkpoint() passes positional tensors only, so `output_attentions` is bound here.
            def run_layer(*inputs):
                return layer(*inputs, output_attentions)

            return run_layer

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                # Record the input of each layer; the final output is appended after the loop.
                encoder_states = encoder_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = torch.utils.checkpoint.checkpoint(
                    as_checkpoint_fn(encoder_layer),
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                )
            else:
                layer_outputs = encoder_layer(
                    hidden_states,
                    attention_mask,
                    causal_attention_mask,
                    output_attentions=output_attentions,
                )

            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        if return_dict:
            return BaseModelOutput(
                last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
            )
        return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
588
+
589
+
590
class EvaCLIPTextTransformer(EvaCLIPPreTrainedModel):
    """
    Text tower: token and position embeddings, a causally masked transformer encoder, and a
    final normalization (RMSNorm or LayerNorm, per `config.use_rms_norm`).
    """

    def __init__(self, config: EvaCLIPTextConfig):
        super().__init__(config)
        self.config = config
        width = config.hidden_size
        norm_cls = RMSNorm if config.use_rms_norm else nn.LayerNorm
        self.embeddings = EvaCLIPTextEmbeddings(config)
        self.encoder = EvaCLIPEncoder(config)
        self.final_layer_norm = norm_cls(width, eps=config.layer_norm_eps)

    def gradient_checkpointing_enable(self):
        """Turn on gradient checkpointing in the encoder."""
        self.encoder.gradient_checkpointing = True

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """
        Encode `input_ids`.

        Returns the normalized last hidden state and a pooled output taken, for each sequence, at
        the position of its largest token id. Raises `ValueError` when `input_ids` is `None`.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        bsz, seq_len = input_shape
        # CLIP's text model uses causal mask, prepare it here.
        # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324
        causal_attention_mask = self._build_causal_attention_mask(bsz, seq_len, hidden_states.dtype)
        causal_attention_mask = causal_attention_mask.to(hidden_states.device)

        if attention_mask is not None:
            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
            attention_mask = _expand_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            causal_attention_mask=causal_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = self.final_layer_norm(encoder_outputs[0])

        # Pool at the argmax of the token ids in each row. This assumes the end-of-text token has the
        # highest id in every sequence - confirm against the tokenizer in use.
        # Casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14.
        row_index = torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device)
        eot_index = input_ids.to(dtype=torch.int, device=last_hidden_state.device).argmax(dim=-1)
        pooled_output = last_hidden_state[row_index, eot_index]

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )

    def _build_causal_attention_mask(self, bsz, seq_len, dtype):
        """
        Build an additive causal mask of shape `[bsz, 1, seq_len, seq_len]`.

        Entries strictly above the diagonal hold the most negative finite value of `dtype`;
        the diagonal and everything below it are zero.
        """
        mask = torch.full((bsz, seq_len, seq_len), torch.finfo(dtype).min, dtype=dtype)
        mask.triu_(1)  # keep only the strict upper triangle; diagonal and below become 0
        return mask.unsqueeze(1)  # add the singleton head axis
675
+
676
class EvaCLIPTextModel(EvaCLIPPreTrainedModel):
    """Standalone text model: a thin wrapper that owns an `EvaCLIPTextTransformer` as `text_model`."""

    config_class = EvaCLIPTextConfig

    _no_split_modules = ["EvaCLIPEncoderLayer"]

    def __init__(self, config: EvaCLIPTextConfig):
        super().__init__(config)
        self.text_model = EvaCLIPTextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the token embedding table of the wrapped text transformer."""
        embeddings = self.text_model.embeddings
        return embeddings.token_embedding

    def set_input_embeddings(self, value):
        """Replace the token embedding table of the wrapped text transformer with `value`."""
        embeddings = self.text_model.embeddings
        embeddings.token_embedding = value

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """Forward all arguments to the wrapped text transformer; `return_dict` defaults to the config value."""
        if return_dict is None:
            return_dict = self.config.use_return_dict

        forward_kwargs = dict(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        return self.text_model(**forward_kwargs)
712
+
713
+
714
class EvaCLIPVisionTransformer(EvaCLIPPreTrainedModel):
    """
    Vision tower: patch/class/position embeddings, a transformer encoder, and a normalization
    (RMSNorm or LayerNorm, per `config.use_rms_norm`) applied to the pooled class token only.
    """

    def __init__(self, config: EvaCLIPVisionConfig):
        super().__init__(config)
        self.config = config
        width = config.hidden_size
        norm_cls = RMSNorm if config.use_rms_norm else nn.LayerNorm
        self.embeddings = EvaCLIPVisionEmbeddings(config)
        self.encoder = EvaCLIPEncoder(config)
        self.post_layernorm = norm_cls(width, eps=config.layer_norm_eps)

    def gradient_checkpointing_enable(self):
        """Turn on gradient checkpointing in the encoder."""
        self.encoder.gradient_checkpointing = True

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """
        Encode `pixel_values`.

        Returns the encoder's last hidden state (not normalized) and a pooled output, which is the
        token at sequence index 0 after `post_layernorm`. Raises `ValueError` when `pixel_values`
        is `None`.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict

        if pixel_values is None:
            raise ValueError("You have to specify pixel_values")

        encoder_outputs = self.encoder(
            inputs_embeds=self.embeddings(pixel_values),
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        last_hidden_state = encoder_outputs[0]
        # Index 0 along the sequence axis is the class token prepended by the embeddings.
        pooled_output = self.post_layernorm(last_hidden_state[:, 0, :])

        if not return_dict:
            return (last_hidden_state, pooled_output) + encoder_outputs[1:]

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )
765
+
766
class EvaCLIPVisionModel(EvaCLIPPreTrainedModel):
    """Standalone vision encoder: a thin wrapper that owns an `EvaCLIPVisionTransformer`.

    The class derives from `EvaCLIPPreTrainedModel` rather than bare `nn.Module`:
    the constructor passes `config` to the base class and calls `post_init()`,
    and `forward` reads `self.config`. `nn.Module.__init__` rejects positional
    arguments and provides neither `post_init` nor `config`, so with `nn.Module`
    as the base the class could not be instantiated.
    """

    config_class = EvaCLIPVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: EvaCLIPVisionConfig):
        super().__init__(config)
        self.vision_model = EvaCLIPVisionTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the patch-embedding module of the wrapped vision transformer."""
        return self.vision_model.embeddings.patch_embedding

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPooling]:
        """Run the wrapped vision transformer and return its output unchanged.

        Only `return_dict` is defaulted here (from `self.config.use_return_dict`);
        the remaining arguments are passed through as given.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        return self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
796
+
797
+
798
class EvaCLIPModel(EvaCLIPPreTrainedModel):
    """Joint text/vision model projecting both towers into one embedding space.

    Attributes created in `__init__`:
        text_model, vision_model: the two transformer towers.
        text_projection, visual_projection: bias-free linear maps from each
            tower's hidden size to `config.projection_dim`.
        logit_scale: constant tensor holding 100.0, multiplied into the
            similarity logits in `forward`.
    """

    config_class = EvaCLIPConfig

    def __init__(self, config: EvaCLIPConfig):
        super().__init__(config)

        # Sub-configs are validated by class *name* rather than with isinstance.
        if not (type(config.text_config).__name__ == "EvaCLIPTextConfig"):
            raise ValueError(
                "config.text_config is expected to be of type EvaCLIPTextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not (type(config.vision_config).__name__ == "EvaCLIPVisionConfig"):
            raise ValueError(
                "config.vision_config is expected to be of type EvaCLIPVisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        self.projection_dim = config.projection_dim
        self.text_embed_dim = text_config.hidden_size
        self.vision_embed_dim = vision_config.hidden_size

        self.text_model = EvaCLIPTextTransformer(text_config)
        self.vision_model = EvaCLIPVisionTransformer(vision_config)

        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
        # Fixed scale stored in linear form (100), not as a log value.
        self.logit_scale = torch.tensor(100., requires_grad=False)

        # Initialize weights and apply final processing
        self.post_init()

    def _resolve_output_flags(self, output_attentions, output_hidden_states, return_dict):
        """Replace flags left as None with the values stored on `self.config`.

        Uses the top-level model config (not the text/vision sub-configs).
        Returns the tuple `(output_attentions, output_hidden_states, return_dict)`.
        """
        if output_attentions is None:
            output_attentions = self.config.output_attentions
        if output_hidden_states is None:
            output_hidden_states = self.config.output_hidden_states
        if return_dict is None:
            return_dict = self.config.use_return_dict
        return output_attentions, output_hidden_states, return_dict

    def encode_text(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        """Return the projected text features (not L2-normalised).

        The text tower's pooled output (element 1 of its result) is passed
        through `text_projection`.
        """
        output_attentions, output_hidden_states, return_dict = self._resolve_output_flags(
            output_attentions, output_hidden_states, return_dict
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = text_outputs[1]
        text_features = self.text_projection(pooled_output)

        return text_features

    def encode_image(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.FloatTensor:
        """Return the projected image features (not L2-normalised).

        The vision tower's pooled output (element 1 of its result) is passed
        through `visual_projection`.
        """
        output_attentions, output_hidden_states, return_dict = self._resolve_output_flags(
            output_attentions, output_hidden_states, return_dict
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        pooled_output = vision_outputs[1]  # pooled_output
        image_features = self.visual_projection(pooled_output)

        return image_features

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, EvaCLIPOutput]:
        """Compute scaled cosine-similarity logits between text and image inputs.

        Both towers are run, their pooled outputs are projected and
        L2-normalised, and the text-by-image similarity matrix is multiplied by
        `self.logit_scale`. When `return_loss` is truthy, `clip_loss` is
        evaluated on `logits_per_text`.

        Returns:
            An `EvaCLIPOutput` when `return_dict` resolves to True. Otherwise
            the tuple `(logits_per_image, logits_per_text, text_embeds,
            image_embeds, text_outputs, vision_outputs)`, prefixed with the loss
            when one was computed.
        """
        output_attentions, output_hidden_states, return_dict = self._resolve_output_flags(
            output_attentions, output_hidden_states, return_dict
        )

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        image_embeds = vision_outputs[1]
        image_embeds = self.visual_projection(image_embeds)

        text_embeds = text_outputs[1]
        text_embeds = self.text_projection(text_embeds)

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        # `self.logit_scale` is stored as the constant 100.0, so it is applied
        # directly. Exponentiating it (exp(100) ~ 2.7e43) exceeds the float32
        # range and would turn every non-zero logit into +/-inf.
        logit_scale = self.logit_scale
        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            loss = clip_loss(logits_per_text)

        if not return_dict:
            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
            return ((loss,) + output) if loss is not None else output

        return EvaCLIPOutput(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )
956
+
957
+
958
class EvaCLIPTextModelWithProjection(EvaCLIPPreTrainedModel):
    """Text tower followed by a bias-free linear projection of its pooled output."""

    config_class = EvaCLIPTextConfig

    _no_split_modules = ["EvaCLIPEncoderLayer"]

    def __init__(self, config: EvaCLIPTextConfig):
        super().__init__(config)

        self.text_model = EvaCLIPTextTransformer(config)

        self.text_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Initialize weights and apply final processing. Every other model class
        # in this file finishes construction with post_init(); it was missing here.
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the token-embedding module of the wrapped text transformer."""
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        """Replace the token-embedding module of the wrapped text transformer."""
        self.text_model.embeddings.token_embedding = value

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, EvaCLIPTextModelOutput]:
        """Encode text and project the pooled output with `text_projection`.

        Returns:
            An `EvaCLIPTextModelOutput` when `return_dict` resolves to True.
            Otherwise a tuple starting with `(text_embeds, last_hidden_state)`
            followed by the remaining text-tower outputs, with None entries
            dropped.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        text_outputs = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the tower output is the pooled representation.
        pooled_output = text_outputs[1]

        text_embeds = self.text_projection(pooled_output)

        if not return_dict:
            outputs = (text_embeds, text_outputs[0]) + text_outputs[2:]
            return tuple(output for output in outputs if output is not None)

        return EvaCLIPTextModelOutput(
            text_embeds=text_embeds,
            last_hidden_state=text_outputs.last_hidden_state,
            hidden_states=text_outputs.hidden_states,
            attentions=text_outputs.attentions,
        )
1011
+
1012
class EvaCLIPVisionModelWithProjection(EvaCLIPPreTrainedModel):
    """Vision tower followed by a bias-free linear projection of its pooled output."""

    config_class = EvaCLIPVisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: EvaCLIPVisionConfig):
        super().__init__(config)

        self.vision_model = EvaCLIPVisionTransformer(config)
        self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        """Return the patch-embedding module of the wrapped vision transformer."""
        embeddings = self.vision_model.embeddings
        return embeddings.patch_embedding

    def forward(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, EvaCLIPVisionModelOutput]:
        """Encode images and project the pooled output with `visual_projection`.

        Returns:
            An `EvaCLIPVisionModelOutput` when `return_dict` resolves to True.
            Otherwise a tuple starting with `(image_embeds, last_hidden_state)`
            followed by the remaining vision-tower outputs, with None entries
            dropped.
        """
        if return_dict is None:
            return_dict = self.config.use_return_dict

        vision_outputs = self.vision_model(
            pixel_values=pixel_values,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        # Element 1 of the tower output is the pooled representation.
        image_embeds = self.visual_projection(vision_outputs[1])

        if return_dict:
            return EvaCLIPVisionModelOutput(
                image_embeds=image_embeds,
                last_hidden_state=vision_outputs.last_hidden_state,
                hidden_states=vision_outputs.hidden_states,
                attentions=vision_outputs.attentions,
            )

        candidates = (image_embeds, vision_outputs[0]) + vision_outputs[2:]
        return tuple(item for item in candidates if item is not None)
pytorch_model.bin.index.json ADDED
@@ -0,0 +1,882 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 32882994176
4
+ },
5
+ "weight_map": {
6
+ "text_model.embeddings.position_embedding.weight": "pytorch_model-00001-of-00004.bin",
7
+ "text_model.embeddings.token_embedding.weight": "pytorch_model-00001-of-00004.bin",
8
+ "text_model.encoder.layers.0.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
9
+ "text_model.encoder.layers.0.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
10
+ "text_model.encoder.layers.0.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
11
+ "text_model.encoder.layers.0.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
12
+ "text_model.encoder.layers.0.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
13
+ "text_model.encoder.layers.0.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
14
+ "text_model.encoder.layers.0.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
15
+ "text_model.encoder.layers.0.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
16
+ "text_model.encoder.layers.0.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
17
+ "text_model.encoder.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
18
+ "text_model.encoder.layers.0.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
19
+ "text_model.encoder.layers.0.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
20
+ "text_model.encoder.layers.0.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
21
+ "text_model.encoder.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
22
+ "text_model.encoder.layers.0.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
23
+ "text_model.encoder.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
24
+ "text_model.encoder.layers.1.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
25
+ "text_model.encoder.layers.1.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
26
+ "text_model.encoder.layers.1.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
27
+ "text_model.encoder.layers.1.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
28
+ "text_model.encoder.layers.1.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
29
+ "text_model.encoder.layers.1.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
30
+ "text_model.encoder.layers.1.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
31
+ "text_model.encoder.layers.1.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
32
+ "text_model.encoder.layers.1.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
33
+ "text_model.encoder.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
34
+ "text_model.encoder.layers.1.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
35
+ "text_model.encoder.layers.1.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
36
+ "text_model.encoder.layers.1.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
37
+ "text_model.encoder.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
38
+ "text_model.encoder.layers.1.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
39
+ "text_model.encoder.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
40
+ "text_model.encoder.layers.10.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
41
+ "text_model.encoder.layers.10.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
42
+ "text_model.encoder.layers.10.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
43
+ "text_model.encoder.layers.10.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
44
+ "text_model.encoder.layers.10.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
45
+ "text_model.encoder.layers.10.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
46
+ "text_model.encoder.layers.10.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
47
+ "text_model.encoder.layers.10.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
48
+ "text_model.encoder.layers.10.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
49
+ "text_model.encoder.layers.10.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
50
+ "text_model.encoder.layers.10.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
51
+ "text_model.encoder.layers.10.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
52
+ "text_model.encoder.layers.10.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
53
+ "text_model.encoder.layers.10.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
54
+ "text_model.encoder.layers.10.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
55
+ "text_model.encoder.layers.10.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
56
+ "text_model.encoder.layers.11.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
57
+ "text_model.encoder.layers.11.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
58
+ "text_model.encoder.layers.11.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
59
+ "text_model.encoder.layers.11.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
60
+ "text_model.encoder.layers.11.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
61
+ "text_model.encoder.layers.11.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
62
+ "text_model.encoder.layers.11.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
63
+ "text_model.encoder.layers.11.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
64
+ "text_model.encoder.layers.11.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
65
+ "text_model.encoder.layers.11.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
66
+ "text_model.encoder.layers.11.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
67
+ "text_model.encoder.layers.11.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
68
+ "text_model.encoder.layers.11.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
69
+ "text_model.encoder.layers.11.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
70
+ "text_model.encoder.layers.11.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
71
+ "text_model.encoder.layers.11.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
72
+ "text_model.encoder.layers.12.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
73
+ "text_model.encoder.layers.12.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
74
+ "text_model.encoder.layers.12.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
75
+ "text_model.encoder.layers.12.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
76
+ "text_model.encoder.layers.12.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
77
+ "text_model.encoder.layers.12.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
78
+ "text_model.encoder.layers.12.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
79
+ "text_model.encoder.layers.12.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
80
+ "text_model.encoder.layers.12.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
81
+ "text_model.encoder.layers.12.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
82
+ "text_model.encoder.layers.12.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
83
+ "text_model.encoder.layers.12.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
84
+ "text_model.encoder.layers.12.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
85
+ "text_model.encoder.layers.12.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
86
+ "text_model.encoder.layers.12.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
87
+ "text_model.encoder.layers.12.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
88
+ "text_model.encoder.layers.13.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
89
+ "text_model.encoder.layers.13.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
90
+ "text_model.encoder.layers.13.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
91
+ "text_model.encoder.layers.13.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
92
+ "text_model.encoder.layers.13.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
93
+ "text_model.encoder.layers.13.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
94
+ "text_model.encoder.layers.13.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
95
+ "text_model.encoder.layers.13.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
96
+ "text_model.encoder.layers.13.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
97
+ "text_model.encoder.layers.13.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
98
+ "text_model.encoder.layers.13.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
99
+ "text_model.encoder.layers.13.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
100
+ "text_model.encoder.layers.13.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
101
+ "text_model.encoder.layers.13.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
102
+ "text_model.encoder.layers.13.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
103
+ "text_model.encoder.layers.13.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
104
+ "text_model.encoder.layers.14.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
105
+ "text_model.encoder.layers.14.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
106
+ "text_model.encoder.layers.14.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
107
+ "text_model.encoder.layers.14.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
108
+ "text_model.encoder.layers.14.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
109
+ "text_model.encoder.layers.14.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
110
+ "text_model.encoder.layers.14.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
111
+ "text_model.encoder.layers.14.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
112
+ "text_model.encoder.layers.14.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
113
+ "text_model.encoder.layers.14.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
114
+ "text_model.encoder.layers.14.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
115
+ "text_model.encoder.layers.14.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
116
+ "text_model.encoder.layers.14.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
117
+ "text_model.encoder.layers.14.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
118
+ "text_model.encoder.layers.14.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
119
+ "text_model.encoder.layers.14.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
120
+ "text_model.encoder.layers.15.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
121
+ "text_model.encoder.layers.15.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
122
+ "text_model.encoder.layers.15.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
123
+ "text_model.encoder.layers.15.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
124
+ "text_model.encoder.layers.15.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
125
+ "text_model.encoder.layers.15.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
126
+ "text_model.encoder.layers.15.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
127
+ "text_model.encoder.layers.15.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
128
+ "text_model.encoder.layers.15.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
129
+ "text_model.encoder.layers.15.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
130
+ "text_model.encoder.layers.15.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
131
+ "text_model.encoder.layers.15.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
132
+ "text_model.encoder.layers.15.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
133
+ "text_model.encoder.layers.15.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
134
+ "text_model.encoder.layers.15.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
135
+ "text_model.encoder.layers.15.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
136
+ "text_model.encoder.layers.16.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
137
+ "text_model.encoder.layers.16.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
138
+ "text_model.encoder.layers.16.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
139
+ "text_model.encoder.layers.16.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
140
+ "text_model.encoder.layers.16.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
141
+ "text_model.encoder.layers.16.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
142
+ "text_model.encoder.layers.16.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
143
+ "text_model.encoder.layers.16.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
144
+ "text_model.encoder.layers.16.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
145
+ "text_model.encoder.layers.16.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
146
+ "text_model.encoder.layers.16.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
147
+ "text_model.encoder.layers.16.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
148
+ "text_model.encoder.layers.16.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
149
+ "text_model.encoder.layers.16.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
150
+ "text_model.encoder.layers.16.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
151
+ "text_model.encoder.layers.16.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
152
+ "text_model.encoder.layers.17.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
153
+ "text_model.encoder.layers.17.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
154
+ "text_model.encoder.layers.17.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
155
+ "text_model.encoder.layers.17.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
156
+ "text_model.encoder.layers.17.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
157
+ "text_model.encoder.layers.17.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
158
+ "text_model.encoder.layers.17.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
159
+ "text_model.encoder.layers.17.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
160
+ "text_model.encoder.layers.17.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
161
+ "text_model.encoder.layers.17.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
162
+ "text_model.encoder.layers.17.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
163
+ "text_model.encoder.layers.17.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
164
+ "text_model.encoder.layers.17.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
165
+ "text_model.encoder.layers.17.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
166
+ "text_model.encoder.layers.17.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
167
+ "text_model.encoder.layers.17.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
168
+ "text_model.encoder.layers.18.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
169
+ "text_model.encoder.layers.18.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
170
+ "text_model.encoder.layers.18.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
171
+ "text_model.encoder.layers.18.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
172
+ "text_model.encoder.layers.18.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
173
+ "text_model.encoder.layers.18.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
174
+ "text_model.encoder.layers.18.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
175
+ "text_model.encoder.layers.18.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
176
+ "text_model.encoder.layers.18.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
177
+ "text_model.encoder.layers.18.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
178
+ "text_model.encoder.layers.18.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
179
+ "text_model.encoder.layers.18.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
180
+ "text_model.encoder.layers.18.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
181
+ "text_model.encoder.layers.18.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
182
+ "text_model.encoder.layers.18.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
183
+ "text_model.encoder.layers.18.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
184
+ "text_model.encoder.layers.19.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
185
+ "text_model.encoder.layers.19.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
186
+ "text_model.encoder.layers.19.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
187
+ "text_model.encoder.layers.19.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
188
+ "text_model.encoder.layers.19.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
189
+ "text_model.encoder.layers.19.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
190
+ "text_model.encoder.layers.19.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
191
+ "text_model.encoder.layers.19.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
192
+ "text_model.encoder.layers.19.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
193
+ "text_model.encoder.layers.19.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
194
+ "text_model.encoder.layers.19.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
195
+ "text_model.encoder.layers.19.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
196
+ "text_model.encoder.layers.19.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
197
+ "text_model.encoder.layers.19.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
198
+ "text_model.encoder.layers.19.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
199
+ "text_model.encoder.layers.19.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
200
+ "text_model.encoder.layers.2.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
201
+ "text_model.encoder.layers.2.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
202
+ "text_model.encoder.layers.2.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
203
+ "text_model.encoder.layers.2.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
204
+ "text_model.encoder.layers.2.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
205
+ "text_model.encoder.layers.2.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
206
+ "text_model.encoder.layers.2.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
207
+ "text_model.encoder.layers.2.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
208
+ "text_model.encoder.layers.2.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
209
+ "text_model.encoder.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
210
+ "text_model.encoder.layers.2.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
211
+ "text_model.encoder.layers.2.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
212
+ "text_model.encoder.layers.2.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
213
+ "text_model.encoder.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
214
+ "text_model.encoder.layers.2.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
215
+ "text_model.encoder.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
216
+ "text_model.encoder.layers.20.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
217
+ "text_model.encoder.layers.20.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
218
+ "text_model.encoder.layers.20.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
219
+ "text_model.encoder.layers.20.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
220
+ "text_model.encoder.layers.20.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
221
+ "text_model.encoder.layers.20.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
222
+ "text_model.encoder.layers.20.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
223
+ "text_model.encoder.layers.20.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
224
+ "text_model.encoder.layers.20.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
225
+ "text_model.encoder.layers.20.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
226
+ "text_model.encoder.layers.20.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
227
+ "text_model.encoder.layers.20.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
228
+ "text_model.encoder.layers.20.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
229
+ "text_model.encoder.layers.20.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
230
+ "text_model.encoder.layers.20.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
231
+ "text_model.encoder.layers.20.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
232
+ "text_model.encoder.layers.21.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
233
+ "text_model.encoder.layers.21.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
234
+ "text_model.encoder.layers.21.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
235
+ "text_model.encoder.layers.21.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
236
+ "text_model.encoder.layers.21.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
237
+ "text_model.encoder.layers.21.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
238
+ "text_model.encoder.layers.21.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
239
+ "text_model.encoder.layers.21.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
240
+ "text_model.encoder.layers.21.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
241
+ "text_model.encoder.layers.21.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
242
+ "text_model.encoder.layers.21.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
243
+ "text_model.encoder.layers.21.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
244
+ "text_model.encoder.layers.21.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
245
+ "text_model.encoder.layers.21.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
246
+ "text_model.encoder.layers.21.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
247
+ "text_model.encoder.layers.21.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
248
+ "text_model.encoder.layers.22.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
249
+ "text_model.encoder.layers.22.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
250
+ "text_model.encoder.layers.22.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
251
+ "text_model.encoder.layers.22.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
252
+ "text_model.encoder.layers.22.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
253
+ "text_model.encoder.layers.22.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
254
+ "text_model.encoder.layers.22.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
255
+ "text_model.encoder.layers.22.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
256
+ "text_model.encoder.layers.22.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
257
+ "text_model.encoder.layers.22.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
258
+ "text_model.encoder.layers.22.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
259
+ "text_model.encoder.layers.22.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
260
+ "text_model.encoder.layers.22.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
261
+ "text_model.encoder.layers.22.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
262
+ "text_model.encoder.layers.22.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
263
+ "text_model.encoder.layers.22.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
264
+ "text_model.encoder.layers.23.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
265
+ "text_model.encoder.layers.23.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
266
+ "text_model.encoder.layers.23.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
267
+ "text_model.encoder.layers.23.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
268
+ "text_model.encoder.layers.23.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
269
+ "text_model.encoder.layers.23.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
270
+ "text_model.encoder.layers.23.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
271
+ "text_model.encoder.layers.23.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
272
+ "text_model.encoder.layers.23.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
273
+ "text_model.encoder.layers.23.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
274
+ "text_model.encoder.layers.23.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
275
+ "text_model.encoder.layers.23.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
276
+ "text_model.encoder.layers.23.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
277
+ "text_model.encoder.layers.23.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
278
+ "text_model.encoder.layers.23.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
279
+ "text_model.encoder.layers.23.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
280
+ "text_model.encoder.layers.24.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
281
+ "text_model.encoder.layers.24.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
282
+ "text_model.encoder.layers.24.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
283
+ "text_model.encoder.layers.24.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
284
+ "text_model.encoder.layers.24.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
285
+ "text_model.encoder.layers.24.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
286
+ "text_model.encoder.layers.24.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
287
+ "text_model.encoder.layers.24.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
288
+ "text_model.encoder.layers.24.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
289
+ "text_model.encoder.layers.24.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
290
+ "text_model.encoder.layers.24.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
291
+ "text_model.encoder.layers.24.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
292
+ "text_model.encoder.layers.24.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
293
+ "text_model.encoder.layers.24.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
294
+ "text_model.encoder.layers.24.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
295
+ "text_model.encoder.layers.24.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
296
+ "text_model.encoder.layers.25.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
297
+ "text_model.encoder.layers.25.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
298
+ "text_model.encoder.layers.25.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
299
+ "text_model.encoder.layers.25.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
300
+ "text_model.encoder.layers.25.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
301
+ "text_model.encoder.layers.25.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
302
+ "text_model.encoder.layers.25.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
303
+ "text_model.encoder.layers.25.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
304
+ "text_model.encoder.layers.25.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
305
+ "text_model.encoder.layers.25.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
306
+ "text_model.encoder.layers.25.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
307
+ "text_model.encoder.layers.25.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
308
+ "text_model.encoder.layers.25.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
309
+ "text_model.encoder.layers.25.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
310
+ "text_model.encoder.layers.25.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
311
+ "text_model.encoder.layers.25.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
312
+ "text_model.encoder.layers.26.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
313
+ "text_model.encoder.layers.26.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
314
+ "text_model.encoder.layers.26.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
315
+ "text_model.encoder.layers.26.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
316
+ "text_model.encoder.layers.26.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
317
+ "text_model.encoder.layers.26.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
318
+ "text_model.encoder.layers.26.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
319
+ "text_model.encoder.layers.26.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
320
+ "text_model.encoder.layers.26.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
321
+ "text_model.encoder.layers.26.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
322
+ "text_model.encoder.layers.26.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
323
+ "text_model.encoder.layers.26.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
324
+ "text_model.encoder.layers.26.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
325
+ "text_model.encoder.layers.26.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
326
+ "text_model.encoder.layers.26.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
327
+ "text_model.encoder.layers.26.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
328
+ "text_model.encoder.layers.27.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
329
+ "text_model.encoder.layers.27.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
330
+ "text_model.encoder.layers.27.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
331
+ "text_model.encoder.layers.27.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
332
+ "text_model.encoder.layers.27.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
333
+ "text_model.encoder.layers.27.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
334
+ "text_model.encoder.layers.27.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
335
+ "text_model.encoder.layers.27.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
336
+ "text_model.encoder.layers.27.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
337
+ "text_model.encoder.layers.27.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
338
+ "text_model.encoder.layers.27.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
339
+ "text_model.encoder.layers.27.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
340
+ "text_model.encoder.layers.27.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
341
+ "text_model.encoder.layers.27.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
342
+ "text_model.encoder.layers.27.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
343
+ "text_model.encoder.layers.27.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
344
+ "text_model.encoder.layers.28.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
345
+ "text_model.encoder.layers.28.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
346
+ "text_model.encoder.layers.28.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
347
+ "text_model.encoder.layers.28.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
348
+ "text_model.encoder.layers.28.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
349
+ "text_model.encoder.layers.28.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
350
+ "text_model.encoder.layers.28.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
351
+ "text_model.encoder.layers.28.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
352
+ "text_model.encoder.layers.28.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
353
+ "text_model.encoder.layers.28.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
354
+ "text_model.encoder.layers.28.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
355
+ "text_model.encoder.layers.28.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
356
+ "text_model.encoder.layers.28.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
357
+ "text_model.encoder.layers.28.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
358
+ "text_model.encoder.layers.28.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
359
+ "text_model.encoder.layers.28.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
360
+ "text_model.encoder.layers.29.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
361
+ "text_model.encoder.layers.29.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
362
+ "text_model.encoder.layers.29.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
363
+ "text_model.encoder.layers.29.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
364
+ "text_model.encoder.layers.29.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
365
+ "text_model.encoder.layers.29.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
366
+ "text_model.encoder.layers.29.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
367
+ "text_model.encoder.layers.29.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
368
+ "text_model.encoder.layers.29.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
369
+ "text_model.encoder.layers.29.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
370
+ "text_model.encoder.layers.29.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
371
+ "text_model.encoder.layers.29.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
372
+ "text_model.encoder.layers.29.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
373
+ "text_model.encoder.layers.29.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
374
+ "text_model.encoder.layers.29.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
375
+ "text_model.encoder.layers.29.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
376
+ "text_model.encoder.layers.3.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
377
+ "text_model.encoder.layers.3.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
378
+ "text_model.encoder.layers.3.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
379
+ "text_model.encoder.layers.3.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
380
+ "text_model.encoder.layers.3.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
381
+ "text_model.encoder.layers.3.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
382
+ "text_model.encoder.layers.3.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
383
+ "text_model.encoder.layers.3.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
384
+ "text_model.encoder.layers.3.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
385
+ "text_model.encoder.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
386
+ "text_model.encoder.layers.3.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
387
+ "text_model.encoder.layers.3.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
388
+ "text_model.encoder.layers.3.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
389
+ "text_model.encoder.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
390
+ "text_model.encoder.layers.3.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
391
+ "text_model.encoder.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
392
+ "text_model.encoder.layers.30.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
393
+ "text_model.encoder.layers.30.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
394
+ "text_model.encoder.layers.30.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
395
+ "text_model.encoder.layers.30.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
396
+ "text_model.encoder.layers.30.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
397
+ "text_model.encoder.layers.30.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
398
+ "text_model.encoder.layers.30.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
399
+ "text_model.encoder.layers.30.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
400
+ "text_model.encoder.layers.30.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
401
+ "text_model.encoder.layers.30.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
402
+ "text_model.encoder.layers.30.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
403
+ "text_model.encoder.layers.30.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
404
+ "text_model.encoder.layers.30.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
405
+ "text_model.encoder.layers.30.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
406
+ "text_model.encoder.layers.30.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
407
+ "text_model.encoder.layers.30.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
408
+ "text_model.encoder.layers.31.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
409
+ "text_model.encoder.layers.31.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
410
+ "text_model.encoder.layers.31.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
411
+ "text_model.encoder.layers.31.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
412
+ "text_model.encoder.layers.31.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
413
+ "text_model.encoder.layers.31.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
414
+ "text_model.encoder.layers.31.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
415
+ "text_model.encoder.layers.31.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
416
+ "text_model.encoder.layers.31.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
417
+ "text_model.encoder.layers.31.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
418
+ "text_model.encoder.layers.31.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
419
+ "text_model.encoder.layers.31.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
420
+ "text_model.encoder.layers.31.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
421
+ "text_model.encoder.layers.31.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
422
+ "text_model.encoder.layers.31.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
423
+ "text_model.encoder.layers.31.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
424
+ "text_model.encoder.layers.4.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
425
+ "text_model.encoder.layers.4.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
426
+ "text_model.encoder.layers.4.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
427
+ "text_model.encoder.layers.4.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
428
+ "text_model.encoder.layers.4.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
429
+ "text_model.encoder.layers.4.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
430
+ "text_model.encoder.layers.4.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
431
+ "text_model.encoder.layers.4.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
432
+ "text_model.encoder.layers.4.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
433
+ "text_model.encoder.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
434
+ "text_model.encoder.layers.4.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
435
+ "text_model.encoder.layers.4.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
436
+ "text_model.encoder.layers.4.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
437
+ "text_model.encoder.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
438
+ "text_model.encoder.layers.4.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
439
+ "text_model.encoder.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
440
+ "text_model.encoder.layers.5.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
441
+ "text_model.encoder.layers.5.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
442
+ "text_model.encoder.layers.5.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
443
+ "text_model.encoder.layers.5.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
444
+ "text_model.encoder.layers.5.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
445
+ "text_model.encoder.layers.5.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
446
+ "text_model.encoder.layers.5.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
447
+ "text_model.encoder.layers.5.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
448
+ "text_model.encoder.layers.5.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
449
+ "text_model.encoder.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
450
+ "text_model.encoder.layers.5.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
451
+ "text_model.encoder.layers.5.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
452
+ "text_model.encoder.layers.5.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
453
+ "text_model.encoder.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
454
+ "text_model.encoder.layers.5.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
455
+ "text_model.encoder.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
456
+ "text_model.encoder.layers.6.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
457
+ "text_model.encoder.layers.6.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
458
+ "text_model.encoder.layers.6.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
459
+ "text_model.encoder.layers.6.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
460
+ "text_model.encoder.layers.6.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
461
+ "text_model.encoder.layers.6.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
462
+ "text_model.encoder.layers.6.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
463
+ "text_model.encoder.layers.6.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
464
+ "text_model.encoder.layers.6.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
465
+ "text_model.encoder.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
466
+ "text_model.encoder.layers.6.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
467
+ "text_model.encoder.layers.6.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
468
+ "text_model.encoder.layers.6.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
469
+ "text_model.encoder.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
470
+ "text_model.encoder.layers.6.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
471
+ "text_model.encoder.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
472
+ "text_model.encoder.layers.7.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
473
+ "text_model.encoder.layers.7.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
474
+ "text_model.encoder.layers.7.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
475
+ "text_model.encoder.layers.7.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
476
+ "text_model.encoder.layers.7.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
477
+ "text_model.encoder.layers.7.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
478
+ "text_model.encoder.layers.7.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
479
+ "text_model.encoder.layers.7.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
480
+ "text_model.encoder.layers.7.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
481
+ "text_model.encoder.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
482
+ "text_model.encoder.layers.7.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
483
+ "text_model.encoder.layers.7.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
484
+ "text_model.encoder.layers.7.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
485
+ "text_model.encoder.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
486
+ "text_model.encoder.layers.7.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
487
+ "text_model.encoder.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
488
+ "text_model.encoder.layers.8.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
489
+ "text_model.encoder.layers.8.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
490
+ "text_model.encoder.layers.8.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
491
+ "text_model.encoder.layers.8.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
492
+ "text_model.encoder.layers.8.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
493
+ "text_model.encoder.layers.8.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
494
+ "text_model.encoder.layers.8.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
495
+ "text_model.encoder.layers.8.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
496
+ "text_model.encoder.layers.8.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
497
+ "text_model.encoder.layers.8.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
498
+ "text_model.encoder.layers.8.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
499
+ "text_model.encoder.layers.8.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
500
+ "text_model.encoder.layers.8.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
501
+ "text_model.encoder.layers.8.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
502
+ "text_model.encoder.layers.8.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
503
+ "text_model.encoder.layers.8.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
504
+ "text_model.encoder.layers.9.layer_norm1.bias": "pytorch_model-00001-of-00004.bin",
505
+ "text_model.encoder.layers.9.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
506
+ "text_model.encoder.layers.9.layer_norm2.bias": "pytorch_model-00001-of-00004.bin",
507
+ "text_model.encoder.layers.9.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
508
+ "text_model.encoder.layers.9.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
509
+ "text_model.encoder.layers.9.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
510
+ "text_model.encoder.layers.9.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
511
+ "text_model.encoder.layers.9.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
512
+ "text_model.encoder.layers.9.self_attn.k_proj.bias": "pytorch_model-00001-of-00004.bin",
513
+ "text_model.encoder.layers.9.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
514
+ "text_model.encoder.layers.9.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
515
+ "text_model.encoder.layers.9.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
516
+ "text_model.encoder.layers.9.self_attn.q_proj.bias": "pytorch_model-00001-of-00004.bin",
517
+ "text_model.encoder.layers.9.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
518
+ "text_model.encoder.layers.9.self_attn.v_proj.bias": "pytorch_model-00001-of-00004.bin",
519
+ "text_model.encoder.layers.9.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
520
+ "text_model.final_layer_norm.bias": "pytorch_model-00001-of-00004.bin",
521
+ "text_model.final_layer_norm.weight": "pytorch_model-00001-of-00004.bin",
522
+ "text_projection.weight": "pytorch_model-00004-of-00004.bin",
523
+ "vision_model.embeddings.class_embedding": "pytorch_model-00001-of-00004.bin",
524
+ "vision_model.embeddings.patch_embedding.bias": "pytorch_model-00001-of-00004.bin",
525
+ "vision_model.embeddings.patch_embedding.weight": "pytorch_model-00001-of-00004.bin",
526
+ "vision_model.embeddings.position_embedding.weight": "pytorch_model-00001-of-00004.bin",
527
+ "vision_model.encoder.layers.0.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
528
+ "vision_model.encoder.layers.0.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
529
+ "vision_model.encoder.layers.0.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
530
+ "vision_model.encoder.layers.0.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
531
+ "vision_model.encoder.layers.0.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
532
+ "vision_model.encoder.layers.0.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
533
+ "vision_model.encoder.layers.0.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
534
+ "vision_model.encoder.layers.0.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
535
+ "vision_model.encoder.layers.0.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
536
+ "vision_model.encoder.layers.0.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
537
+ "vision_model.encoder.layers.0.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
538
+ "vision_model.encoder.layers.1.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
539
+ "vision_model.encoder.layers.1.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
540
+ "vision_model.encoder.layers.1.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
541
+ "vision_model.encoder.layers.1.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
542
+ "vision_model.encoder.layers.1.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
543
+ "vision_model.encoder.layers.1.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
544
+ "vision_model.encoder.layers.1.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
545
+ "vision_model.encoder.layers.1.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
546
+ "vision_model.encoder.layers.1.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
547
+ "vision_model.encoder.layers.1.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
548
+ "vision_model.encoder.layers.1.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
549
+ "vision_model.encoder.layers.10.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
550
+ "vision_model.encoder.layers.10.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
551
+ "vision_model.encoder.layers.10.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
552
+ "vision_model.encoder.layers.10.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
553
+ "vision_model.encoder.layers.10.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
554
+ "vision_model.encoder.layers.10.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
555
+ "vision_model.encoder.layers.10.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
556
+ "vision_model.encoder.layers.10.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
557
+ "vision_model.encoder.layers.10.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
558
+ "vision_model.encoder.layers.10.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
559
+ "vision_model.encoder.layers.10.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
560
+ "vision_model.encoder.layers.11.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
561
+ "vision_model.encoder.layers.11.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
562
+ "vision_model.encoder.layers.11.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
563
+ "vision_model.encoder.layers.11.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
564
+ "vision_model.encoder.layers.11.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
565
+ "vision_model.encoder.layers.11.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
566
+ "vision_model.encoder.layers.11.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
567
+ "vision_model.encoder.layers.11.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
568
+ "vision_model.encoder.layers.11.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
569
+ "vision_model.encoder.layers.11.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
570
+ "vision_model.encoder.layers.11.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
571
+ "vision_model.encoder.layers.12.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
572
+ "vision_model.encoder.layers.12.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
573
+ "vision_model.encoder.layers.12.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
574
+ "vision_model.encoder.layers.12.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
575
+ "vision_model.encoder.layers.12.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
576
+ "vision_model.encoder.layers.12.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
577
+ "vision_model.encoder.layers.12.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
578
+ "vision_model.encoder.layers.12.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
579
+ "vision_model.encoder.layers.12.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
580
+ "vision_model.encoder.layers.12.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
581
+ "vision_model.encoder.layers.12.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
582
+ "vision_model.encoder.layers.13.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
583
+ "vision_model.encoder.layers.13.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
584
+ "vision_model.encoder.layers.13.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
585
+ "vision_model.encoder.layers.13.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
586
+ "vision_model.encoder.layers.13.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
587
+ "vision_model.encoder.layers.13.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
588
+ "vision_model.encoder.layers.13.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
589
+ "vision_model.encoder.layers.13.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
590
+ "vision_model.encoder.layers.13.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
591
+ "vision_model.encoder.layers.13.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
592
+ "vision_model.encoder.layers.13.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
593
+ "vision_model.encoder.layers.14.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
594
+ "vision_model.encoder.layers.14.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
595
+ "vision_model.encoder.layers.14.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
596
+ "vision_model.encoder.layers.14.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
597
+ "vision_model.encoder.layers.14.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
598
+ "vision_model.encoder.layers.14.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
599
+ "vision_model.encoder.layers.14.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
600
+ "vision_model.encoder.layers.14.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
601
+ "vision_model.encoder.layers.14.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
602
+ "vision_model.encoder.layers.14.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
603
+ "vision_model.encoder.layers.14.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
604
+ "vision_model.encoder.layers.15.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
605
+ "vision_model.encoder.layers.15.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
606
+ "vision_model.encoder.layers.15.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
607
+ "vision_model.encoder.layers.15.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
608
+ "vision_model.encoder.layers.15.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
609
+ "vision_model.encoder.layers.15.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
610
+ "vision_model.encoder.layers.15.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
611
+ "vision_model.encoder.layers.15.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
612
+ "vision_model.encoder.layers.15.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
613
+ "vision_model.encoder.layers.15.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
614
+ "vision_model.encoder.layers.15.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
615
+ "vision_model.encoder.layers.16.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
616
+ "vision_model.encoder.layers.16.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
617
+ "vision_model.encoder.layers.16.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
618
+ "vision_model.encoder.layers.16.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
619
+ "vision_model.encoder.layers.16.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
620
+ "vision_model.encoder.layers.16.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
621
+ "vision_model.encoder.layers.16.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
622
+ "vision_model.encoder.layers.16.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
623
+ "vision_model.encoder.layers.16.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
624
+ "vision_model.encoder.layers.16.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
625
+ "vision_model.encoder.layers.16.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
626
+ "vision_model.encoder.layers.17.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
627
+ "vision_model.encoder.layers.17.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
628
+ "vision_model.encoder.layers.17.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
629
+ "vision_model.encoder.layers.17.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
630
+ "vision_model.encoder.layers.17.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
631
+ "vision_model.encoder.layers.17.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
632
+ "vision_model.encoder.layers.17.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
633
+ "vision_model.encoder.layers.17.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
634
+ "vision_model.encoder.layers.17.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
635
+ "vision_model.encoder.layers.17.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
636
+ "vision_model.encoder.layers.17.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
637
+ "vision_model.encoder.layers.18.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
638
+ "vision_model.encoder.layers.18.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
639
+ "vision_model.encoder.layers.18.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
640
+ "vision_model.encoder.layers.18.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
641
+ "vision_model.encoder.layers.18.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
642
+ "vision_model.encoder.layers.18.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
643
+ "vision_model.encoder.layers.18.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
644
+ "vision_model.encoder.layers.18.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
645
+ "vision_model.encoder.layers.18.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
646
+ "vision_model.encoder.layers.18.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
647
+ "vision_model.encoder.layers.18.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
648
+ "vision_model.encoder.layers.19.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
649
+ "vision_model.encoder.layers.19.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
650
+ "vision_model.encoder.layers.19.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
651
+ "vision_model.encoder.layers.19.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
652
+ "vision_model.encoder.layers.19.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
653
+ "vision_model.encoder.layers.19.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
654
+ "vision_model.encoder.layers.19.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
655
+ "vision_model.encoder.layers.19.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
656
+ "vision_model.encoder.layers.19.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
657
+ "vision_model.encoder.layers.19.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
658
+ "vision_model.encoder.layers.19.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
659
+ "vision_model.encoder.layers.2.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
660
+ "vision_model.encoder.layers.2.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
661
+ "vision_model.encoder.layers.2.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
662
+ "vision_model.encoder.layers.2.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
663
+ "vision_model.encoder.layers.2.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
664
+ "vision_model.encoder.layers.2.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
665
+ "vision_model.encoder.layers.2.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
666
+ "vision_model.encoder.layers.2.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
667
+ "vision_model.encoder.layers.2.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
668
+ "vision_model.encoder.layers.2.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
669
+ "vision_model.encoder.layers.2.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
670
+ "vision_model.encoder.layers.20.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
671
+ "vision_model.encoder.layers.20.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
672
+ "vision_model.encoder.layers.20.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
673
+ "vision_model.encoder.layers.20.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
674
+ "vision_model.encoder.layers.20.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
675
+ "vision_model.encoder.layers.20.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
676
+ "vision_model.encoder.layers.20.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
677
+ "vision_model.encoder.layers.20.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
678
+ "vision_model.encoder.layers.20.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
679
+ "vision_model.encoder.layers.20.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
680
+ "vision_model.encoder.layers.20.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
681
+ "vision_model.encoder.layers.21.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
682
+ "vision_model.encoder.layers.21.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
683
+ "vision_model.encoder.layers.21.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
684
+ "vision_model.encoder.layers.21.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
685
+ "vision_model.encoder.layers.21.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
686
+ "vision_model.encoder.layers.21.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
687
+ "vision_model.encoder.layers.21.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
688
+ "vision_model.encoder.layers.21.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
689
+ "vision_model.encoder.layers.21.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
690
+ "vision_model.encoder.layers.21.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
691
+ "vision_model.encoder.layers.21.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
692
+ "vision_model.encoder.layers.22.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
693
+ "vision_model.encoder.layers.22.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
694
+ "vision_model.encoder.layers.22.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
695
+ "vision_model.encoder.layers.22.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
696
+ "vision_model.encoder.layers.22.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
697
+ "vision_model.encoder.layers.22.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
698
+ "vision_model.encoder.layers.22.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
699
+ "vision_model.encoder.layers.22.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
700
+ "vision_model.encoder.layers.22.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
701
+ "vision_model.encoder.layers.22.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
702
+ "vision_model.encoder.layers.22.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
703
+ "vision_model.encoder.layers.23.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
704
+ "vision_model.encoder.layers.23.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
705
+ "vision_model.encoder.layers.23.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
706
+ "vision_model.encoder.layers.23.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
707
+ "vision_model.encoder.layers.23.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
708
+ "vision_model.encoder.layers.23.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
709
+ "vision_model.encoder.layers.23.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
710
+ "vision_model.encoder.layers.23.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
711
+ "vision_model.encoder.layers.23.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
712
+ "vision_model.encoder.layers.23.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
713
+ "vision_model.encoder.layers.23.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
714
+ "vision_model.encoder.layers.24.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
715
+ "vision_model.encoder.layers.24.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
716
+ "vision_model.encoder.layers.24.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
717
+ "vision_model.encoder.layers.24.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
718
+ "vision_model.encoder.layers.24.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
719
+ "vision_model.encoder.layers.24.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
720
+ "vision_model.encoder.layers.24.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
721
+ "vision_model.encoder.layers.24.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
722
+ "vision_model.encoder.layers.24.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
723
+ "vision_model.encoder.layers.24.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
724
+ "vision_model.encoder.layers.24.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
725
+ "vision_model.encoder.layers.25.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
726
+ "vision_model.encoder.layers.25.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
727
+ "vision_model.encoder.layers.25.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
728
+ "vision_model.encoder.layers.25.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
729
+ "vision_model.encoder.layers.25.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
730
+ "vision_model.encoder.layers.25.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
731
+ "vision_model.encoder.layers.25.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
732
+ "vision_model.encoder.layers.25.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
733
+ "vision_model.encoder.layers.25.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
734
+ "vision_model.encoder.layers.25.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
735
+ "vision_model.encoder.layers.25.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
736
+ "vision_model.encoder.layers.26.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
737
+ "vision_model.encoder.layers.26.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
738
+ "vision_model.encoder.layers.26.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
739
+ "vision_model.encoder.layers.26.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
740
+ "vision_model.encoder.layers.26.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
741
+ "vision_model.encoder.layers.26.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
742
+ "vision_model.encoder.layers.26.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
743
+ "vision_model.encoder.layers.26.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
744
+ "vision_model.encoder.layers.26.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
745
+ "vision_model.encoder.layers.26.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
746
+ "vision_model.encoder.layers.26.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
747
+ "vision_model.encoder.layers.27.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
748
+ "vision_model.encoder.layers.27.layer_norm2.weight": "pytorch_model-00003-of-00004.bin",
749
+ "vision_model.encoder.layers.27.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
750
+ "vision_model.encoder.layers.27.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
751
+ "vision_model.encoder.layers.27.mlp.fc2.bias": "pytorch_model-00003-of-00004.bin",
752
+ "vision_model.encoder.layers.27.mlp.fc2.weight": "pytorch_model-00003-of-00004.bin",
753
+ "vision_model.encoder.layers.27.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
754
+ "vision_model.encoder.layers.27.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
755
+ "vision_model.encoder.layers.27.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
756
+ "vision_model.encoder.layers.27.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
757
+ "vision_model.encoder.layers.27.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
758
+ "vision_model.encoder.layers.28.layer_norm1.weight": "pytorch_model-00003-of-00004.bin",
759
+ "vision_model.encoder.layers.28.layer_norm2.weight": "pytorch_model-00004-of-00004.bin",
760
+ "vision_model.encoder.layers.28.mlp.fc1.bias": "pytorch_model-00003-of-00004.bin",
761
+ "vision_model.encoder.layers.28.mlp.fc1.weight": "pytorch_model-00003-of-00004.bin",
762
+ "vision_model.encoder.layers.28.mlp.fc2.bias": "pytorch_model-00004-of-00004.bin",
763
+ "vision_model.encoder.layers.28.mlp.fc2.weight": "pytorch_model-00004-of-00004.bin",
764
+ "vision_model.encoder.layers.28.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
765
+ "vision_model.encoder.layers.28.self_attn.out_proj.bias": "pytorch_model-00003-of-00004.bin",
766
+ "vision_model.encoder.layers.28.self_attn.out_proj.weight": "pytorch_model-00003-of-00004.bin",
767
+ "vision_model.encoder.layers.28.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
768
+ "vision_model.encoder.layers.28.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
769
+ "vision_model.encoder.layers.29.layer_norm1.weight": "pytorch_model-00004-of-00004.bin",
770
+ "vision_model.encoder.layers.29.layer_norm2.weight": "pytorch_model-00004-of-00004.bin",
771
+ "vision_model.encoder.layers.29.mlp.fc1.bias": "pytorch_model-00004-of-00004.bin",
772
+ "vision_model.encoder.layers.29.mlp.fc1.weight": "pytorch_model-00004-of-00004.bin",
773
+ "vision_model.encoder.layers.29.mlp.fc2.bias": "pytorch_model-00004-of-00004.bin",
774
+ "vision_model.encoder.layers.29.mlp.fc2.weight": "pytorch_model-00004-of-00004.bin",
775
+ "vision_model.encoder.layers.29.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
776
+ "vision_model.encoder.layers.29.self_attn.out_proj.bias": "pytorch_model-00004-of-00004.bin",
777
+ "vision_model.encoder.layers.29.self_attn.out_proj.weight": "pytorch_model-00004-of-00004.bin",
778
+ "vision_model.encoder.layers.29.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
779
+ "vision_model.encoder.layers.29.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
780
+ "vision_model.encoder.layers.3.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
781
+ "vision_model.encoder.layers.3.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
782
+ "vision_model.encoder.layers.3.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
783
+ "vision_model.encoder.layers.3.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
784
+ "vision_model.encoder.layers.3.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
785
+ "vision_model.encoder.layers.3.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
786
+ "vision_model.encoder.layers.3.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
787
+ "vision_model.encoder.layers.3.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
788
+ "vision_model.encoder.layers.3.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
789
+ "vision_model.encoder.layers.3.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
790
+ "vision_model.encoder.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
791
+ "vision_model.encoder.layers.30.layer_norm1.weight": "pytorch_model-00004-of-00004.bin",
792
+ "vision_model.encoder.layers.30.layer_norm2.weight": "pytorch_model-00004-of-00004.bin",
793
+ "vision_model.encoder.layers.30.mlp.fc1.bias": "pytorch_model-00004-of-00004.bin",
794
+ "vision_model.encoder.layers.30.mlp.fc1.weight": "pytorch_model-00004-of-00004.bin",
795
+ "vision_model.encoder.layers.30.mlp.fc2.bias": "pytorch_model-00004-of-00004.bin",
796
+ "vision_model.encoder.layers.30.mlp.fc2.weight": "pytorch_model-00004-of-00004.bin",
797
+ "vision_model.encoder.layers.30.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
798
+ "vision_model.encoder.layers.30.self_attn.out_proj.bias": "pytorch_model-00004-of-00004.bin",
799
+ "vision_model.encoder.layers.30.self_attn.out_proj.weight": "pytorch_model-00004-of-00004.bin",
800
+ "vision_model.encoder.layers.30.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
801
+ "vision_model.encoder.layers.30.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
802
+ "vision_model.encoder.layers.31.layer_norm1.weight": "pytorch_model-00004-of-00004.bin",
803
+ "vision_model.encoder.layers.31.layer_norm2.weight": "pytorch_model-00004-of-00004.bin",
804
+ "vision_model.encoder.layers.31.mlp.fc1.bias": "pytorch_model-00004-of-00004.bin",
805
+ "vision_model.encoder.layers.31.mlp.fc1.weight": "pytorch_model-00004-of-00004.bin",
806
+ "vision_model.encoder.layers.31.mlp.fc2.bias": "pytorch_model-00004-of-00004.bin",
807
+ "vision_model.encoder.layers.31.mlp.fc2.weight": "pytorch_model-00004-of-00004.bin",
808
+ "vision_model.encoder.layers.31.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
809
+ "vision_model.encoder.layers.31.self_attn.out_proj.bias": "pytorch_model-00004-of-00004.bin",
810
+ "vision_model.encoder.layers.31.self_attn.out_proj.weight": "pytorch_model-00004-of-00004.bin",
811
+ "vision_model.encoder.layers.31.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
812
+ "vision_model.encoder.layers.31.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
813
+ "vision_model.encoder.layers.4.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
814
+ "vision_model.encoder.layers.4.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
815
+ "vision_model.encoder.layers.4.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
816
+ "vision_model.encoder.layers.4.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
817
+ "vision_model.encoder.layers.4.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
818
+ "vision_model.encoder.layers.4.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
819
+ "vision_model.encoder.layers.4.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
820
+ "vision_model.encoder.layers.4.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
821
+ "vision_model.encoder.layers.4.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
822
+ "vision_model.encoder.layers.4.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
823
+ "vision_model.encoder.layers.4.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
824
+ "vision_model.encoder.layers.5.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
825
+ "vision_model.encoder.layers.5.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
826
+ "vision_model.encoder.layers.5.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
827
+ "vision_model.encoder.layers.5.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
828
+ "vision_model.encoder.layers.5.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
829
+ "vision_model.encoder.layers.5.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
830
+ "vision_model.encoder.layers.5.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
831
+ "vision_model.encoder.layers.5.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
832
+ "vision_model.encoder.layers.5.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
833
+ "vision_model.encoder.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
834
+ "vision_model.encoder.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
835
+ "vision_model.encoder.layers.6.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
836
+ "vision_model.encoder.layers.6.layer_norm2.weight": "pytorch_model-00001-of-00004.bin",
837
+ "vision_model.encoder.layers.6.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
838
+ "vision_model.encoder.layers.6.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
839
+ "vision_model.encoder.layers.6.mlp.fc2.bias": "pytorch_model-00001-of-00004.bin",
840
+ "vision_model.encoder.layers.6.mlp.fc2.weight": "pytorch_model-00001-of-00004.bin",
841
+ "vision_model.encoder.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
842
+ "vision_model.encoder.layers.6.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
843
+ "vision_model.encoder.layers.6.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
844
+ "vision_model.encoder.layers.6.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
845
+ "vision_model.encoder.layers.6.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
846
+ "vision_model.encoder.layers.7.layer_norm1.weight": "pytorch_model-00001-of-00004.bin",
847
+ "vision_model.encoder.layers.7.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
848
+ "vision_model.encoder.layers.7.mlp.fc1.bias": "pytorch_model-00001-of-00004.bin",
849
+ "vision_model.encoder.layers.7.mlp.fc1.weight": "pytorch_model-00001-of-00004.bin",
850
+ "vision_model.encoder.layers.7.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
851
+ "vision_model.encoder.layers.7.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
852
+ "vision_model.encoder.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
853
+ "vision_model.encoder.layers.7.self_attn.out_proj.bias": "pytorch_model-00001-of-00004.bin",
854
+ "vision_model.encoder.layers.7.self_attn.out_proj.weight": "pytorch_model-00001-of-00004.bin",
855
+ "vision_model.encoder.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
856
+ "vision_model.encoder.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
857
+ "vision_model.encoder.layers.8.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
858
+ "vision_model.encoder.layers.8.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
859
+ "vision_model.encoder.layers.8.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
860
+ "vision_model.encoder.layers.8.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
861
+ "vision_model.encoder.layers.8.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
862
+ "vision_model.encoder.layers.8.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
863
+ "vision_model.encoder.layers.8.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
864
+ "vision_model.encoder.layers.8.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
865
+ "vision_model.encoder.layers.8.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
866
+ "vision_model.encoder.layers.8.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
867
+ "vision_model.encoder.layers.8.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
868
+ "vision_model.encoder.layers.9.layer_norm1.weight": "pytorch_model-00002-of-00004.bin",
869
+ "vision_model.encoder.layers.9.layer_norm2.weight": "pytorch_model-00002-of-00004.bin",
870
+ "vision_model.encoder.layers.9.mlp.fc1.bias": "pytorch_model-00002-of-00004.bin",
871
+ "vision_model.encoder.layers.9.mlp.fc1.weight": "pytorch_model-00002-of-00004.bin",
872
+ "vision_model.encoder.layers.9.mlp.fc2.bias": "pytorch_model-00002-of-00004.bin",
873
+ "vision_model.encoder.layers.9.mlp.fc2.weight": "pytorch_model-00002-of-00004.bin",
874
+ "vision_model.encoder.layers.9.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
875
+ "vision_model.encoder.layers.9.self_attn.out_proj.bias": "pytorch_model-00002-of-00004.bin",
876
+ "vision_model.encoder.layers.9.self_attn.out_proj.weight": "pytorch_model-00002-of-00004.bin",
877
+ "vision_model.encoder.layers.9.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
878
+ "vision_model.encoder.layers.9.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
879
+ "vision_model.post_layernorm.weight": "pytorch_model-00004-of-00004.bin",
880
+ "visual_projection.weight": "pytorch_model-00004-of-00004.bin"
881
+ }
882
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|startoftext|>",
4
+ "lstrip": false,
5
+ "normalized": true,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|endoftext|>",
11
+ "lstrip": false,
12
+ "normalized": true,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<|endoftext|>",
17
+ "unk_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": true,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "__type": "AddedToken",
4
+ "content": "<|startoftext|>",
5
+ "lstrip": false,
6
+ "normalized": true,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ },
10
+ "eos_token": {
11
+ "__type": "AddedToken",
12
+ "content": "<|endoftext|>",
13
+ "lstrip": false,
14
+ "normalized": true,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "errors": "replace",
19
+ "model_max_length": 1000000000000000019884624838656,
20
+ "pad_token": "<|endoftext|>",
21
+ "special_tokens_map_file": null,
22
+ "tokenizer_class": "CLIPTokenizer",
23
+ "unk_token": {
24
+ "__type": "AddedToken",
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": true,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff