ahmed-masry committed on
Commit 91ae748
1 Parent(s): b6e6767

Upload LlavaT5ForConditionalGeneration

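This commit uploads the LlavaT5ForConditionalGeneration weights together with the custom modeling code. A minimal loading sketch, assuming the repository's config.json maps model_type "llava_t5" to the custom class via auto_map (the repo id below is a placeholder, not taken from this commit):

from transformers import AutoConfig, AutoModelForSeq2SeqLM, AutoTokenizer

repo_id = "<user>/<chartinstruct-flan-t5-repo>"  # placeholder; the actual repo id is not shown here

# trust_remote_code=True lets transformers import chartinstruct_flant5_modeling.py
# from the repo and instantiate LlavaT5ForConditionalGeneration.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)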
chartinstruct_flant5_modeling.py CHANGED
@@ -8,592 +8,14 @@ from transformers import AutoConfig, AutoModelForSeq2SeqLM, \
8
  T5Config, T5Model, T5ForConditionalGeneration
9
 
10
  from transformers.models.t5.modeling_t5 import T5Stack
11
- from transformers.modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput, BaseModelOutput
12
  from transformers.utils import ModelOutput
13
  from transformers import DonutSwinModel, DonutImageProcessor, DonutSwinConfig
14
-
15
  from abc import ABC, abstractmethod
16
  import re
17
 
18
- # Model Constants
19
- IGNORE_INDEX = -100
20
- IMAGE_TOKEN_INDEX = -200
21
- DEFAULT_IMAGE_TOKEN = "<image>"
22
- DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
23
- DEFAULT_IM_START_TOKEN = "<im_start>"
24
- DEFAULT_IM_END_TOKEN = "<im_end>"
25
-
26
- class UniChartVisionTower(nn.Module):
27
- def __init__(self, vision_tower, args, delay_load=False):
28
- super().__init__()
29
-
30
- self.is_loaded = False
31
-
32
- self.vision_tower_name = vision_tower
33
- self.select_layer = args.mm_vision_select_layer
34
- self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
35
-
36
- if not delay_load:
37
- self.load_model()
38
- else:
39
- self.cfg_only = DonutSwinConfig.from_pretrained(self.vision_tower_name)
40
-
41
- def load_model(self):
42
- self.image_processor = DonutImageProcessor.from_pretrained(self.vision_tower_name)
43
- self.vision_tower = DonutSwinModel.from_pretrained(self.vision_tower_name)
44
-
45
- # Changed. Check for this variable. It's false by default.
46
- if not self.tune_vision_encoder:
47
- self.vision_tower.requires_grad_(False)
48
-
49
- self.is_loaded = True
50
-
51
- def feature_select(self, image_forward_outs):
52
- image_features = image_forward_outs.hidden_states[self.select_layer]
53
- if self.select_feature == 'patch':
54
- image_features = image_features[:, 1:]
55
- elif self.select_feature == 'cls_patch':
56
- image_features = image_features
57
- else:
58
- raise ValueError(f'Unexpected select feature: {self.select_feature}')
59
- return image_features
60
-
61
- @torch.no_grad()
62
- def forward(self, images):
63
- if type(images) is list:
64
- image_features = []
65
- for image in images:
66
- image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
67
- image_feature = self.feature_select(image_forward_out).to(image.dtype)
68
- image_features.append(image_feature)
69
- else:
70
- image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
71
- image_features = self.feature_select(image_forward_outs).to(images.dtype)
72
-
73
- return image_features
74
-
75
- @property
76
- def dummy_feature(self):
77
- return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
78
-
79
- @property
80
- def dtype(self):
81
- return self.vision_tower.dtype
82
-
83
- @property
84
- def device(self):
85
- return self.vision_tower.device
86
-
87
- @property
88
- def config(self):
89
- if self.is_loaded:
90
- return self.vision_tower.config
91
- else:
92
- return self.cfg_only
93
-
94
- @property
95
- def hidden_size(self):
96
- return self.config.hidden_size
97
-
98
- @property
99
- def num_patches(self):
100
- return (self.config.image_size // self.config.patch_size) ** 2
101
-
102
- def build_vision_tower(vision_tower_cfg, **kwargs):
103
- vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
104
- is_absolute_path_exists = os.path.exists(vision_tower)
105
- if is_absolute_path_exists:
106
- if 'unichart' in vision_tower:
107
- return UniChartVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
108
-
109
- raise ValueError(f'Unknown vision tower: {vision_tower}')
110
-
111
- def build_vision_projector(config, delay_load=False, **kwargs):
112
- projector_type = getattr(config, 'mm_projector_type', 'mlp3x_gelu')
113
-
114
- if projector_type == 'linear':
115
- return nn.Linear(config.mm_hidden_size, config.hidden_size)
116
-
117
- mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
118
- if mlp_gelu_match:
119
- mlp_depth = int(mlp_gelu_match.group(1))
120
- modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
121
- for _ in range(1, mlp_depth):
122
- modules.append(nn.GELU())
123
- modules.append(nn.Linear(config.hidden_size, config.hidden_size))
124
- return nn.Sequential(*modules)
125
-
126
- raise ValueError(f'Unknown projector type: {projector_type}')
127
-
128
- # Copyright 2023 Haotian Liu
129
- #
130
- # Licensed under the Apache License, Version 2.0 (the "License");
131
- # you may not use this file except in compliance with the License.
132
- # You may obtain a copy of the License at
133
- #
134
- # http://www.apache.org/licenses/LICENSE-2.0
135
- #
136
- # Unless required by applicable law or agreed to in writing, software
137
- # distributed under the License is distributed on an "AS IS" BASIS,
138
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
139
- # See the License for the specific language governing permissions and
140
- # limitations under the License.
141
-
142
-
143
- class LlavaMetaModel:
144
-
145
- def __init__(self, config): #, embed_tokens):
146
- super(LlavaMetaModel, self).__init__(config) #, embed_tokens)
147
- if hasattr(config, "mm_vision_tower"):
148
- self.vision_tower = build_vision_tower(config, delay_load=True)
149
- self.mm_projector = build_vision_projector(self.config) #nn.Linear(config.mm_hidden_size, config.hidden_size)
150
-
151
- def get_vision_tower(self):
152
- vision_tower = getattr(self, 'vision_tower', None)
153
- if type(vision_tower) is list:
154
- vision_tower = vision_tower[0]
155
- return vision_tower
156
-
157
- def initialize_vision_modules(self, model_args, fsdp=None):
158
- vision_tower = model_args.vision_tower
159
- mm_vision_select_layer = model_args.mm_vision_select_layer
160
- mm_vision_select_feature = model_args.mm_vision_select_feature
161
- pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
162
-
163
- self.config.mm_vision_tower = vision_tower
164
-
165
- vision_tower = build_vision_tower(model_args)
166
-
167
- if fsdp is not None and len(fsdp) > 0:
168
- self.vision_tower = [vision_tower]
169
- else:
170
- self.vision_tower = vision_tower
171
-
172
- self.config.use_mm_proj = True
173
- self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
174
- self.config.mm_hidden_size = vision_tower.hidden_size
175
- self.config.mm_vision_select_layer = mm_vision_select_layer
176
- self.config.mm_vision_select_feature = mm_vision_select_feature
177
-
178
- if not hasattr(self, 'mm_projector'):
179
- self.mm_projector = build_vision_projector(self.config) #nn.Linear(self.config.mm_hidden_size, self.config.hidden_size)
180
-
181
- if pretrain_mm_mlp_adapter is not None:
182
- mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
183
- def get_w(weights, keyword):
184
- return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
185
-
186
- self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
187
-
188
-
189
- class LlavaMetaForCausalLM(ABC):
190
-
191
- @abstractmethod
192
- def get_model(self):
193
- pass
194
-
195
- def get_vision_tower(self):
196
- return self.get_model().get_vision_tower()
197
-
198
- def encode_images(self, images):
199
- image_features = self.get_model().get_vision_tower()(images)
200
- image_features = self.get_model().mm_projector(image_features)
201
- return image_features
202
-
203
- def prepare_inputs_labels_for_multimodal(
204
- self, input_ids, attention_mask, past_key_values, labels, images
205
- ):
206
- vision_tower = self.get_vision_tower()
207
- if vision_tower is None or images is None or input_ids.shape[1] == 1:
208
- if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[1] == 1:
209
- attention_mask = torch.ones((attention_mask.shape[0], past_key_values[-1][-1].shape[-2] + 1), dtype=attention_mask.dtype, device=attention_mask.device)
210
- return input_ids, attention_mask, past_key_values, None, labels
211
-
212
- if type(images) is list or images.ndim == 5:
213
- concat_images = torch.cat([image for image in images], dim=0)
214
- image_features = self.encode_images(concat_images)
215
- split_sizes = [image.shape[0] for image in images]
216
- image_features = torch.split(image_features, split_sizes, dim=0)
217
- image_features = [x.flatten(0, 1) for x in image_features]
218
- else:
219
- image_features = self.encode_images(images)
220
-
221
- new_input_embeds = []
222
- new_labels = [] if labels is not None else None
223
- cur_image_idx = 0
224
- for batch_idx, cur_input_ids in enumerate(input_ids):
225
- if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0:
226
- # multimodal LLM, but the current sample is not multimodal
227
- cur_input_embeds = self.get_model().embed_tokens(cur_input_ids)
228
- cur_input_embeds = cur_input_embeds + (0. * self.get_model().mm_projector(vision_tower.dummy_feature)).sum()
229
- new_input_embeds.append(cur_input_embeds)
230
- if labels is not None:
231
- new_labels.append(labels[batch_idx])
232
- cur_image_idx += 1
233
- continue
234
- image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
235
- cur_new_input_embeds = []
236
- if labels is not None:
237
- cur_labels = labels[batch_idx]
238
- cur_new_labels = []
239
- assert cur_labels.shape == cur_input_ids.shape
240
- while image_token_indices.numel() > 0:
241
- cur_image_features = image_features[cur_image_idx]
242
- image_token_start = image_token_indices[0]
243
- if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
244
- cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start-1]).detach())
245
- cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_start-1:image_token_start]))
246
- cur_new_input_embeds.append(cur_image_features)
247
- cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_start+1:image_token_start+2]))
248
- if labels is not None:
249
- cur_new_labels.append(cur_labels[:image_token_start])
250
- cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
251
- cur_new_labels.append(cur_labels[image_token_start:image_token_start+1])
252
- cur_labels = cur_labels[image_token_start+2:]
253
- else:
254
- cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start]))
255
- cur_new_input_embeds.append(cur_image_features)
256
- if labels is not None:
257
- cur_new_labels.append(cur_labels[:image_token_start])
258
- cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
259
- cur_labels = cur_labels[image_token_start+1:]
260
- cur_image_idx += 1
261
- if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
262
- cur_input_ids = cur_input_ids[image_token_start+2:]
263
- else:
264
- cur_input_ids = cur_input_ids[image_token_start+1:]
265
- image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
266
- if cur_input_ids.numel() > 0:
267
- if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
268
- cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids).detach())
269
- else:
270
- cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
271
- if labels is not None:
272
- cur_new_labels.append(cur_labels)
273
- cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
274
- cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
275
- new_input_embeds.append(cur_new_input_embeds)
276
- if labels is not None:
277
- cur_new_labels = torch.cat(cur_new_labels, dim=0)
278
- new_labels.append(cur_new_labels)
279
-
280
- if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
281
- max_len = max(x.shape[0] for x in new_input_embeds)
282
-
283
- new_input_embeds_align = []
284
- for cur_new_embed in new_input_embeds:
285
- cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)
286
- new_input_embeds_align.append(cur_new_embed)
287
- new_input_embeds = torch.stack(new_input_embeds_align, dim=0)
288
-
289
- if labels is not None:
290
- new_labels_align = []
291
- _new_labels = new_labels
292
- for cur_new_label in new_labels:
293
- cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0)
294
- new_labels_align.append(cur_new_label)
295
- new_labels = torch.stack(new_labels_align, dim=0)
296
-
297
- if attention_mask is not None:
298
- new_attention_mask = []
299
- for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(attention_mask, _new_labels, new_labels):
300
- new_attn_mask_pad_left = torch.full((cur_new_labels.shape[0] - labels.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
301
- new_attn_mask_pad_right = torch.full((cur_new_labels_align.shape[0] - cur_new_labels.shape[0],), False, dtype=attention_mask.dtype, device=attention_mask.device)
302
- cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
303
- new_attention_mask.append(cur_new_attention_mask)
304
- attention_mask = torch.stack(new_attention_mask, dim=0)
305
- assert attention_mask.shape == new_labels.shape
306
- else:
307
- new_input_embeds = torch.stack(new_input_embeds, dim=0)
308
- if labels is not None:
309
- new_labels = torch.stack(new_labels, dim=0)
310
-
311
- if attention_mask is not None:
312
- new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
313
- attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
314
- assert attention_mask.shape == new_input_embeds.shape[:2]
315
-
316
- return None, attention_mask, past_key_values, new_input_embeds, new_labels
317
-
318
- def initialize_vision_tokenizer(self, model_args, tokenizer):
319
- if model_args.mm_use_im_patch_token:
320
- tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
321
- self.resize_token_embeddings(len(tokenizer))
322
-
323
- if model_args.mm_use_im_start_end:
324
- num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
325
- self.resize_token_embeddings(len(tokenizer))
326
-
327
- if num_new_tokens > 0:
328
- input_embeddings = self.get_input_embeddings().weight.data
329
- output_embeddings = self.get_output_embeddings().weight.data
330
-
331
- input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
332
- dim=0, keepdim=True)
333
- output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
334
- dim=0, keepdim=True)
335
-
336
- input_embeddings[-num_new_tokens:] = input_embeddings_avg
337
- output_embeddings[-num_new_tokens:] = output_embeddings_avg
338
-
339
- if model_args.tune_mm_mlp_adapter:
340
- for p in self.get_input_embeddings().parameters():
341
- p.requires_grad = True
342
- for p in self.get_output_embeddings().parameters():
343
- p.requires_grad = False
344
-
345
- if model_args.pretrain_mm_mlp_adapter:
346
- mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
347
- embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
348
- assert num_new_tokens == 2
349
- if input_embeddings.shape == embed_tokens_weight.shape:
350
- input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
351
- elif embed_tokens_weight.shape[0] == num_new_tokens:
352
- input_embeddings[-num_new_tokens:] = embed_tokens_weight
353
- else:
354
- raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
355
- elif model_args.mm_use_im_patch_token:
356
- if model_args.tune_mm_mlp_adapter:
357
- for p in self.get_input_embeddings().parameters():
358
- p.requires_grad = False
359
- for p in self.get_output_embeddings().parameters():
360
- p.requires_grad = False
361
-
362
-
363
-
364
- class LlavaMetaForConditionalGeneration(ABC):
365
-
366
- def get_vision_tower(self):
367
- return self.get_encoder().get_vision_tower()
368
-
369
- def encode_images(self, images):
370
- image_features = self.get_encoder().get_vision_tower()(images)
371
- image_features = self.get_encoder().mm_projector(image_features)
372
- return image_features
373
-
374
- def prepare_inputs_labels_for_multimodal(
375
- self, input_ids, attention_mask, labels, images
376
- ):
377
- vision_tower = self.get_vision_tower()
378
- if vision_tower is None or images is None or input_ids.shape[1] == 1:
379
- return input_ids, attention_mask, None
380
-
381
- if type(images) is list or images.ndim == 5:
382
- concat_images = torch.cat([image for image in images], dim=0)
383
- image_features = self.encode_images(concat_images)
384
- split_sizes = [image.shape[0] for image in images]
385
- image_features = torch.split(image_features, split_sizes, dim=0)
386
- image_features = [x.flatten(0, 1) for x in image_features]
387
- else:
388
- image_features = self.encode_images(images)
389
-
390
- # TODO: double check.
391
- if labels is None:
392
- labels = torch.full_like(input_ids, IGNORE_INDEX)
393
- ######
394
-
395
- new_input_embeds = []
396
- new_labels = [] if labels is not None else None
397
- cur_image_idx = 0
398
- for batch_idx, cur_input_ids in enumerate(input_ids):
399
- if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0:
400
- # multimodal LLM, but the current sample is not multimodal
401
- cur_input_embeds = self.get_encoder().embed_tokens(cur_input_ids)
402
- cur_input_embeds = cur_input_embeds + (0. * self.get_encoder().mm_projector(vision_tower.dummy_feature)).sum()
403
- new_input_embeds.append(cur_input_embeds)
404
- if labels is not None:
405
- new_labels.append(labels[batch_idx])
406
- cur_image_idx += 1
407
- continue
408
- image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
409
- cur_new_input_embeds = []
410
- if labels is not None:
411
- cur_labels = labels[batch_idx]
412
- cur_new_labels = []
413
- assert cur_labels.shape == cur_input_ids.shape
414
- while image_token_indices.numel() > 0:
415
- cur_image_features = image_features[cur_image_idx]
416
- image_token_start = image_token_indices[0]
417
- if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
418
- cur_new_input_embeds.append(self.get_encoder().embed_tokens(cur_input_ids[:image_token_start-1]).detach())
419
- cur_new_input_embeds.append(self.get_encoder().embed_tokens(cur_input_ids[image_token_start-1:image_token_start]))
420
- cur_new_input_embeds.append(cur_image_features)
421
- cur_new_input_embeds.append(self.get_encoder().embed_tokens(cur_input_ids[image_token_start+1:image_token_start+2]))
422
- if labels is not None:
423
- cur_new_labels.append(cur_labels[:image_token_start])
424
- cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
425
- cur_new_labels.append(cur_labels[image_token_start:image_token_start+1])
426
- cur_labels = cur_labels[image_token_start+2:]
427
- else:
428
- cur_new_input_embeds.append(self.get_encoder().embed_tokens(cur_input_ids[:image_token_start]))
429
- cur_new_input_embeds.append(cur_image_features)
430
- if labels is not None:
431
- cur_new_labels.append(cur_labels[:image_token_start])
432
- cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
433
- cur_labels = cur_labels[image_token_start+1:]
434
- cur_image_idx += 1
435
- if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
436
- cur_input_ids = cur_input_ids[image_token_start+2:]
437
- else:
438
- cur_input_ids = cur_input_ids[image_token_start+1:]
439
- image_token_indices = torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0]
440
- if cur_input_ids.numel() > 0:
441
- if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
442
- cur_new_input_embeds.append(self.get_encoder().embed_tokens(cur_input_ids).detach())
443
- else:
444
- cur_new_input_embeds.append(self.get_encoder().embed_tokens(cur_input_ids))
445
- if labels is not None:
446
- cur_new_labels.append(cur_labels)
447
- cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
448
- cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
449
- new_input_embeds.append(cur_new_input_embeds)
450
- if labels is not None:
451
- cur_new_labels = torch.cat(cur_new_labels, dim=0)
452
- new_labels.append(cur_new_labels)
453
-
454
- if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
455
- max_len = max(x.shape[0] for x in new_input_embeds)
456
-
457
- new_input_embeds_align = []
458
- for cur_new_embed in new_input_embeds:
459
- cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)
460
- new_input_embeds_align.append(cur_new_embed)
461
- new_input_embeds = torch.stack(new_input_embeds_align, dim=0)
462
-
463
- if labels is not None:
464
- new_labels_align = []
465
- _new_labels = new_labels
466
- for cur_new_label in new_labels:
467
- cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0)
468
- new_labels_align.append(cur_new_label)
469
- new_labels = torch.stack(new_labels_align, dim=0)
470
-
471
- if attention_mask is not None:
472
- new_attention_mask = []
473
- for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(attention_mask, _new_labels, new_labels):
474
- new_attn_mask_pad_left = torch.full((cur_new_labels.shape[0] - labels.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
475
- new_attn_mask_pad_right = torch.full((cur_new_labels_align.shape[0] - cur_new_labels.shape[0],), False, dtype=attention_mask.dtype, device=attention_mask.device)
476
- cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
477
- new_attention_mask.append(cur_new_attention_mask)
478
- attention_mask = torch.stack(new_attention_mask, dim=0)
479
- assert attention_mask.shape == new_labels.shape
480
- else:
481
- new_input_embeds = torch.stack(new_input_embeds, dim=0)
482
- if labels is not None:
483
- new_labels = torch.stack(new_labels, dim=0)
484
-
485
- if attention_mask is not None:
486
- new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
487
- attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
488
- assert attention_mask.shape == new_input_embeds.shape[:2]
489
-
490
- return None, attention_mask, new_input_embeds, new_labels
491
-
492
- def initialize_vision_tokenizer(self, model_args, tokenizer):
493
- if model_args.mm_use_im_patch_token:
494
- tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
495
- self.resize_token_embeddings(len(tokenizer))
496
-
497
- if model_args.mm_use_im_start_end:
498
- num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
499
- self.resize_token_embeddings(len(tokenizer))
500
-
501
- if num_new_tokens > 0:
502
- input_embeddings = self.get_input_embeddings().weight.data
503
- output_embeddings = self.get_output_embeddings().weight.data
504
-
505
- input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
506
- dim=0, keepdim=True)
507
- output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
508
- dim=0, keepdim=True)
509
-
510
- input_embeddings[-num_new_tokens:] = input_embeddings_avg
511
- output_embeddings[-num_new_tokens:] = output_embeddings_avg
512
-
513
- if model_args.tune_mm_mlp_adapter:
514
- for p in self.get_input_embeddings().parameters():
515
- p.requires_grad = True
516
- for p in self.get_output_embeddings().parameters():
517
- p.requires_grad = False
518
-
519
- if model_args.pretrain_mm_mlp_adapter:
520
- mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
521
- embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
522
- assert num_new_tokens == 2
523
- if input_embeddings.shape == embed_tokens_weight.shape:
524
- input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
525
- elif embed_tokens_weight.shape[0] == num_new_tokens:
526
- input_embeddings[-num_new_tokens:] = embed_tokens_weight
527
- else:
528
- raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
529
- elif model_args.mm_use_im_patch_token:
530
- if model_args.tune_mm_mlp_adapter:
531
- for p in self.get_input_embeddings().parameters():
532
- p.requires_grad = False
533
- for p in self.get_output_embeddings().parameters():
534
- p.requires_grad = False
535
-
536
- class LlavaMetaT5Model:
537
-
538
- def __init__(self, config, embed_tokens):
539
- super(LlavaMetaT5Model, self).__init__(config, embed_tokens)
540
- if hasattr(config, "mm_vision_tower"):
541
- self.vision_tower = build_vision_tower(config, delay_load=True)
542
- self.mm_projector = nn.Linear(config.mm_hidden_size, config.hidden_size)
543
-
544
- def get_vision_tower(self):
545
- vision_tower = getattr(self, 'vision_tower', None)
546
- if type(vision_tower) is list:
547
- vision_tower = vision_tower[0]
548
- return vision_tower
549
-
550
- def initialize_vision_modules(self, model_args, fsdp=None):
551
- vision_tower = model_args.vision_tower
552
- mm_vision_select_layer = model_args.mm_vision_select_layer
553
- mm_vision_select_feature = model_args.mm_vision_select_feature
554
- pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
555
-
556
- self.config.mm_vision_tower = vision_tower
557
-
558
- vision_tower = build_vision_tower(model_args)
559
-
560
- if fsdp is not None and len(fsdp) > 0:
561
- self.vision_tower = [vision_tower]
562
- else:
563
- self.vision_tower = vision_tower
564
-
565
- self.config.use_mm_proj = True
566
- self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
567
- self.config.mm_hidden_size = vision_tower.hidden_size
568
- self.config.mm_vision_select_layer = mm_vision_select_layer
569
- self.config.mm_vision_select_feature = mm_vision_select_feature
570
-
571
- if not hasattr(self, 'mm_projector'):
572
- self.mm_projector = build_vision_projector(self.config) #nn.Linear(self.config.mm_hidden_size, self.config.hidden_size)
573
-
574
- if pretrain_mm_mlp_adapter is not None:
575
- mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
576
- def get_w(weights, keyword):
577
- return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
578
-
579
- self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
580
-
581
-
582
-
583
- # Copyright 2023 Haotian Liu
584
- #
585
- # Licensed under the Apache License, Version 2.0 (the "License");
586
- # you may not use this file except in compliance with the License.
587
- # You may obtain a copy of the License at
588
- #
589
- # http://www.apache.org/licenses/LICENSE-2.0
590
- #
591
- # Unless required by applicable law or agreed to in writing, software
592
- # distributed under the License is distributed on an "AS IS" BASIS,
593
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
594
- # See the License for the specific language governing permissions and
595
- # limitations under the License.
596
-
597
 
598
 
599
  @dataclass
@@ -645,14 +67,340 @@ class BaseModelOutputWithPastAndCrossAttentionsWithAttentionMask(ModelOutput):
645
  class LlavaT5Config(T5Config):
646
  model_type = "llava_t5"
647
 
648
- class LlavaT5Stack(LlavaMetaT5Model, T5Stack):
 
 
649
  config_class = LlavaT5Config
650
 
651
- def __init__(self, config: T5Config, embed_tokens=None):
652
- super(LlavaT5Stack, self).__init__(config, embed_tokens)
653

654
 
655
- class LlavaT5ForConditionalGeneration(T5ForConditionalGeneration, LlavaMetaForConditionalGeneration):
656
  config_class = LlavaT5Config
657
 
658
  def __init__(self, config):
@@ -700,7 +448,7 @@ class LlavaT5ForConditionalGeneration(T5ForConditionalGeneration, LlavaMetaForCo
700
  use_cache: Optional[bool] = None,
701
  output_attentions: Optional[bool] = None,
702
  output_hidden_states: Optional[bool] = None,
703
- images: Optional[torch.FloatTensor] = None,
704
  return_dict: Optional[bool] = None,
705
 
706
  decoder_input_ids: Optional[torch.LongTensor] = None,
@@ -723,17 +471,16 @@ class LlavaT5ForConditionalGeneration(T5ForConditionalGeneration, LlavaMetaForCo
723
  #warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
724
  decoder_head_mask = head_mask
725
 
 
 
 
726
  # Encode if needed (training, first prediction pass)
727
  if encoder_outputs is None:
728
- input_ids, attention_mask, inputs_embeds, _ = self.prepare_inputs_labels_for_multimodal(input_ids,
729
- attention_mask,
730
- None, # Important: keep it None
731
- images
732
- )
733
  # Convert encoder inputs in embeddings if needed
734
  encoder_outputs = self.encoder(
735
  input_ids=input_ids,
736
  attention_mask=attention_mask,
 
737
  inputs_embeds=inputs_embeds,
738
  head_mask=head_mask,
739
  output_attentions=output_attentions,
@@ -746,6 +493,7 @@ class LlavaT5ForConditionalGeneration(T5ForConditionalGeneration, LlavaMetaForCo
746
  hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
747
  attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
748
  )
 
749
 
750
  hidden_states = encoder_outputs[0]
751
 
@@ -768,7 +516,6 @@ class LlavaT5ForConditionalGeneration(T5ForConditionalGeneration, LlavaMetaForCo
768
  decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
769
 
770
 
771
- # Decode
772
  decoder_outputs = self.decoder(
773
  input_ids=decoder_input_ids,
774
  attention_mask=decoder_attention_mask,
@@ -858,6 +605,5 @@ class LlavaT5ForConditionalGeneration(T5ForConditionalGeneration, LlavaMetaForCo
858
  "decoder_attention_mask": decoder_attention_mask,
859
  "cross_attn_head_mask": cross_attn_head_mask,
860
  "use_cache": use_cache,
861
- "images": kwargs.get("images", None),
862
  }
863
-
 
8
  T5Config, T5Model, T5ForConditionalGeneration
9
 
10
  from transformers.models.t5.modeling_t5 import T5Stack
11
+ from transformers.modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput, BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions
12
  from transformers.utils import ModelOutput
13
  from transformers import DonutSwinModel, DonutImageProcessor, DonutSwinConfig
 
14
  from abc import ABC, abstractmethod
15
  import re
16
 
17
+ from transformers import T5PreTrainedModel
18
+ from transformers.models.t5.modeling_t5 import T5Block, T5LayerNorm

19
 
20
 
21
  @dataclass
 
67
  class LlavaT5Config(T5Config):
68
  model_type = "llava_t5"
69
 
70
+
71
+
72
+ class LlavaT5Stack(T5PreTrainedModel):
73
  config_class = LlavaT5Config
74
 
75
+ def __init__(self, config, embed_tokens=None):
76
+ super().__init__(config)
77
 
78
+ self.embed_tokens = embed_tokens
79
+ self.is_decoder = config.is_decoder
80
+
81
+ self.block = nn.ModuleList(
82
+ [T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]
83
+ )
84
+ self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
85
+ self.dropout = nn.Dropout(config.dropout_rate)
86
+
87
+ ## Vision
88
+ self.vision_tower = DonutSwinModel(config=config.vision_config)
89
+ self.mm_projector = nn.Linear(config.mm_hidden_size, config.hidden_size)
90
+ self.pad_token_id = 0
91
+ self.image_token_index = 32100
92
+ ##
93
+
94
+ # Initialize weights and apply final processing
95
+ self.post_init()
96
+ # Model parallel
97
+ self.model_parallel = False
98
+ self.device_map = None
99
+ self.gradient_checkpointing = False
100
+
101
+ def _merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids, attention_mask):
102
+ num_images, num_image_patches, embed_dim = image_features.shape
103
+ batch_size, sequence_length = input_ids.shape
104
+ left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))
105
+ # 1. Create a mask to know where special image tokens are
106
+ special_image_token_mask = input_ids == self.image_token_index
107
+ num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
108
+ # Compute the maximum embed dimension
109
+ max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
110
+ batch_indices, non_image_indices = torch.where(input_ids != self.image_token_index)
111
+
112
+ # 2. Compute the positions where text should be written
113
+ # Calculate new positions for text tokens in merged image-text sequence.
114
+ # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens.
115
+ # `torch.cumsum` computes how each image token shifts subsequent text token positions.
116
+ # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
117
+ new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
118
+ nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
119
+ if left_padding:
120
+ new_token_positions += nb_image_pad[:, None] # offset for left padding
121
+ text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
122
+
123
+ # 3. Create the full embedding, already padded to the maximum position
124
+ final_embedding = torch.zeros(
125
+ batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
126
+ )
127
+ final_attention_mask = torch.zeros(
128
+ batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
129
+ )
130
 
131
+ # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
132
+ # set the corresponding tensors into their correct target device.
133
+ target_device = inputs_embeds.device
134
+ batch_indices, non_image_indices, text_to_overwrite = (
135
+ batch_indices.to(target_device),
136
+ non_image_indices.to(target_device),
137
+ text_to_overwrite.to(target_device),
138
+ )
139
+ attention_mask = attention_mask.to(target_device)
140
+
141
+ # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
142
+ # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
143
+ final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
144
+ final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
145
+
146
+ # 5. Fill the embeddings corresponding to the images. Anything that is not `text_positions` needs filling (#29835)
147
+ image_to_overwrite = torch.full(
148
+ (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device
149
+ )
150
+ image_to_overwrite[batch_indices, text_to_overwrite] = False
151
+ image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
152
+
153
+ if image_to_overwrite.sum() != image_features.shape[:-1].numel():
154
+ raise ValueError(
155
+ f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
156
+ f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
157
+ )
158
+
159
+ final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
160
+ final_attention_mask |= image_to_overwrite
161
+
162
+ # 6. Mask out the embedding at padding positions, as we later use the past_key_value value to determine the non-attended tokens.
163
+ batch_indices, pad_indices = torch.where(input_ids == self.pad_token_id)
164
+ indices_to_mask = new_token_positions[batch_indices, pad_indices]
165
+
166
+ final_embedding[batch_indices, indices_to_mask] = 0
167
+
168
+ return final_embedding, final_attention_mask
169
+
170
+ def forward(
171
+ self,
172
+ input_ids=None,
173
+ attention_mask=None,
174
+ pixel_values=None,
175
+ encoder_hidden_states=None,
176
+ encoder_attention_mask=None,
177
+ inputs_embeds=None,
178
+ head_mask=None,
179
+ cross_attn_head_mask=None,
180
+ past_key_values=None,
181
+ use_cache=None,
182
+ output_attentions=None,
183
+ output_hidden_states=None,
184
+ return_dict=None,
185
+ ):
186
+ # Model parallel
187
+ if self.model_parallel:
188
+ torch.cuda.set_device(self.first_device)
189
+ self.embed_tokens = self.embed_tokens.to(self.first_device)
190
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
191
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
192
+ output_hidden_states = (
193
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
194
+ )
195
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
196
+
197
+ if input_ids is not None and inputs_embeds is not None:
198
+ err_msg_prefix = "decoder_" if self.is_decoder else ""
199
+ raise ValueError(
200
+ f"You cannot specify both {err_msg_prefix}input_ids and {err_msg_prefix}inputs_embeds at the same time"
201
+ )
202
+ elif input_ids is not None:
203
+ input_shape = input_ids.size()
204
+ input_ids = input_ids.view(-1, input_shape[-1])
205
+ elif inputs_embeds is not None:
206
+ input_shape = inputs_embeds.size()[:-1]
207
+ else:
208
+ err_msg_prefix = "decoder_" if self.is_decoder else ""
209
+ raise ValueError(f"You have to specify either {err_msg_prefix}input_ids or {err_msg_prefix}inputs_embeds")
210
+
211
+ if inputs_embeds is None:
212
+ if self.embed_tokens is None:
213
+ raise ValueError("You have to initialize the model with valid token embeddings")
214
+ inputs_embeds = self.embed_tokens(input_ids)
215
+
216
+ ### Multimodal
217
+ vision_feature_layer = -1
218
+ vision_feature_select_strategy = "default"
219
+ image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
220
+ # This is not memory efficient at all: output_hidden_states=True will save all the hidden states.
221
+ selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
222
+
223
+ if vision_feature_select_strategy == "default":
224
+ selected_image_feature = selected_image_feature[:, 1:]
225
+ elif vision_feature_select_strategy == "full":
226
+ selected_image_feature = selected_image_feature
227
+ else:
228
+ raise ValueError(
229
+ f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
230
+ )
231
+
232
+ image_features = self.mm_projector(selected_image_feature)
233
+ inputs_embeds = inputs_embeds.to(image_features.dtype)
234
+ inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(
235
+ image_features, inputs_embeds, input_ids, attention_mask
236
+ )
237
+ input_shape = inputs_embeds.size()[:-1]
238
+ #################
239
+
240
+ batch_size, seq_length = input_shape
241
+
242
+ # required mask seq length can be calculated via length of past
243
+ mask_seq_length = past_key_values[0][0].shape[2] + seq_length if past_key_values is not None else seq_length
244
+
245
+ if use_cache is True:
246
+ if not self.is_decoder:
247
+ raise ValueError(f"`use_cache` can only be set to `True` if {self} is used as a decoder")
248
+
249
+ # initialize past_key_values with `None` if past does not exist
250
+ if past_key_values is None:
251
+ past_key_values = [None] * len(self.block)
252
+
253
+ if attention_mask is None:
254
+ attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
255
+
256
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
257
+ # ourselves in which case we just need to make it broadcastable to all heads.
258
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
259
+
260
+ # If a 2D or 3D attention mask is provided for the cross-attention
261
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
262
+ if self.is_decoder and encoder_hidden_states is not None:
263
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
264
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
265
+ if encoder_attention_mask is None:
266
+ encoder_attention_mask = torch.ones(
267
+ encoder_hidden_shape, device=inputs_embeds.device, dtype=torch.long
268
+ )
269
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
270
+ else:
271
+ encoder_extended_attention_mask = None
272
+
273
+ if self.gradient_checkpointing and self.training:
274
+ if use_cache:
275
+ # logger.warning_once(
276
+ # "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
277
+ # )
278
+ use_cache = False
279
+
280
+ # Prepare head mask if needed
281
+ head_mask = self.get_head_mask(head_mask, self.config.num_layers)
282
+ cross_attn_head_mask = self.get_head_mask(cross_attn_head_mask, self.config.num_layers)
283
+ present_key_value_states = () if use_cache else None
284
+ all_hidden_states = () if output_hidden_states else None
285
+ all_attentions = () if output_attentions else None
286
+ all_cross_attentions = () if (output_attentions and self.is_decoder) else None
287
+ position_bias = None
288
+ encoder_decoder_position_bias = None
289
+
290
+ hidden_states = self.dropout(inputs_embeds)
291
+
292
+ for i, (layer_module, past_key_value) in enumerate(zip(self.block, past_key_values)):
293
+ layer_head_mask = head_mask[i]
294
+ cross_attn_layer_head_mask = cross_attn_head_mask[i]
295
+ # Model parallel
296
+ if self.model_parallel:
297
+ torch.cuda.set_device(hidden_states.device)
298
+ # Ensure that attention_mask is always on the same device as hidden_states
299
+ if attention_mask is not None:
300
+ attention_mask = attention_mask.to(hidden_states.device)
301
+ if position_bias is not None:
302
+ position_bias = position_bias.to(hidden_states.device)
303
+ if encoder_hidden_states is not None:
304
+ encoder_hidden_states = encoder_hidden_states.to(hidden_states.device)
305
+ if encoder_extended_attention_mask is not None:
306
+ encoder_extended_attention_mask = encoder_extended_attention_mask.to(hidden_states.device)
307
+ if encoder_decoder_position_bias is not None:
308
+ encoder_decoder_position_bias = encoder_decoder_position_bias.to(hidden_states.device)
309
+ if layer_head_mask is not None:
310
+ layer_head_mask = layer_head_mask.to(hidden_states.device)
311
+ if cross_attn_layer_head_mask is not None:
312
+ cross_attn_layer_head_mask = cross_attn_layer_head_mask.to(hidden_states.device)
313
+ if output_hidden_states:
314
+ all_hidden_states = all_hidden_states + (hidden_states,)
315
+
316
+ if self.gradient_checkpointing and self.training:
317
+ layer_outputs = self._gradient_checkpointing_func(
318
+ layer_module.forward,
319
+ hidden_states,
320
+ extended_attention_mask,
321
+ position_bias,
322
+ encoder_hidden_states,
323
+ encoder_extended_attention_mask,
324
+ encoder_decoder_position_bias,
325
+ layer_head_mask,
326
+ cross_attn_layer_head_mask,
327
+ None, # past_key_value is always None with gradient checkpointing
328
+ use_cache,
329
+ output_attentions,
330
+ )
331
+ else:
332
+ layer_outputs = layer_module(
333
+ hidden_states,
334
+ attention_mask=extended_attention_mask,
335
+ position_bias=position_bias,
336
+ encoder_hidden_states=encoder_hidden_states,
337
+ encoder_attention_mask=encoder_extended_attention_mask,
338
+ encoder_decoder_position_bias=encoder_decoder_position_bias,
339
+ layer_head_mask=layer_head_mask,
340
+ cross_attn_layer_head_mask=cross_attn_layer_head_mask,
341
+ past_key_value=past_key_value,
342
+ use_cache=use_cache,
343
+ output_attentions=output_attentions,
344
+ )
345
+
346
+ # layer_outputs is a tuple with:
347
+ # hidden-states, key-value-states, (self-attention position bias), (self-attention weights), (cross-attention position bias), (cross-attention weights)
348
+ if use_cache is False:
349
+ layer_outputs = layer_outputs[:1] + (None,) + layer_outputs[1:]
350
+
351
+ hidden_states, present_key_value_state = layer_outputs[:2]
352
+
353
+ # We share the position biases between the layers - the first layer store them
354
+ # layer_outputs = hidden-states, key-value-states (self-attention position bias), (self-attention weights),
355
+ # (cross-attention position bias), (cross-attention weights)
356
+ position_bias = layer_outputs[2]
357
+ if self.is_decoder and encoder_hidden_states is not None:
358
+ encoder_decoder_position_bias = layer_outputs[4 if output_attentions else 3]
359
+ # append next layer key value states
360
+ if use_cache:
361
+ present_key_value_states = present_key_value_states + (present_key_value_state,)
362
+
363
+ if output_attentions:
364
+ all_attentions = all_attentions + (layer_outputs[3],)
365
+ if self.is_decoder:
366
+ all_cross_attentions = all_cross_attentions + (layer_outputs[5],)
367
+
368
+ # Model Parallel: If it's the last layer for that device, put things on the next device
369
+ if self.model_parallel:
370
+ for k, v in self.device_map.items():
371
+ if i == v[-1] and "cuda:" + str(k) != self.last_device:
372
+ hidden_states = hidden_states.to("cuda:" + str(k + 1))
373
+
374
+ hidden_states = self.final_layer_norm(hidden_states)
375
+ hidden_states = self.dropout(hidden_states)
376
+
377
+ # Add last layer
378
+ if output_hidden_states:
379
+ all_hidden_states = all_hidden_states + (hidden_states,)
380
+
381
+ if not return_dict:
382
+ return tuple(
383
+ v
384
+ for v in [
385
+ hidden_states,
386
+ present_key_value_states,
387
+ all_hidden_states,
388
+ all_attentions,
389
+ all_cross_attentions,
390
+ ]
391
+ if v is not None
392
+ )
393
+ return BaseModelOutputWithPastAndCrossAttentionsWithAttentionMask(
394
+ last_hidden_state=hidden_states,
395
+ past_key_values=present_key_value_states,
396
+ hidden_states=all_hidden_states,
397
+ attentions=all_attentions,
398
+ cross_attentions=all_cross_attentions,
399
+ attention_mask=attention_mask,
400
+ )
401
+
402
+
403
+ class LlavaT5ForConditionalGeneration(T5ForConditionalGeneration):
404
  config_class = LlavaT5Config
405
 
406
  def __init__(self, config):
 
448
  use_cache: Optional[bool] = None,
449
  output_attentions: Optional[bool] = None,
450
  output_hidden_states: Optional[bool] = None,
451
+ pixel_values: Optional[torch.FloatTensor] = None,
452
  return_dict: Optional[bool] = None,
453
 
454
  decoder_input_ids: Optional[torch.LongTensor] = None,
 
471
  #warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
472
  decoder_head_mask = head_mask
473
 
474
+ if encoder_outputs is not None:
475
+ attention_mask = encoder_outputs.attention_mask
476
+
477
  # Encode if needed (training, first prediction pass)
478
  if encoder_outputs is None:
 
 
 
 
 
479
  # Convert encoder inputs in embeddings if needed
480
  encoder_outputs = self.encoder(
481
  input_ids=input_ids,
482
  attention_mask=attention_mask,
483
+ pixel_values=pixel_values,
484
  inputs_embeds=inputs_embeds,
485
  head_mask=head_mask,
486
  output_attentions=output_attentions,
 
493
  hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
494
  attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
495
  )
496
+
497
 
498
  hidden_states = encoder_outputs[0]
499
 
 
516
  decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)
517
 
518
 
 
519
  decoder_outputs = self.decoder(
520
  input_ids=decoder_input_ids,
521
  attention_mask=decoder_attention_mask,
 
605
  "decoder_attention_mask": decoder_attention_mask,
606
  "cross_attn_head_mask": cross_attn_head_mask,
607
  "use_cache": use_cache,
608
+ "pixel_values": kwargs.get("pixel_values", None),
609
  }
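The modeling diff above replaces the old images keyword with pixel_values in forward() and prepare_inputs_for_generation(), and moves the image-text merging into the new LlavaT5Stack encoder, which expands the image placeholder token (id 32100) into projected DonutSwin patch features. A hedged usage sketch of generation after this change; the tokenizer/image-processor variables and generation settings are assumptions, not part of the commit:

import torch

def generate_answer(model, tokenizer, image_processor, image, prompt, device="cpu"):
    # The prompt is expected to contain the special image placeholder token
    # that LlavaT5Stack replaces with projected DonutSwin features.
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    pixel_values = image_processor(image, return_tensors="pt").pixel_values.to(device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            pixel_values=pixel_values,  # this kwarg replaces the former images= argument
            max_new_tokens=512,
        )
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)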
 
config.json CHANGED
@@ -28,7 +28,7 @@
28
  "mm_use_im_start_end": false,
29
  "mm_vision_select_feature": "patch",
30
  "mm_vision_select_layer": -1,
31
- "mm_vision_tower": "/content/unichart-encoder-512",
32
  "model_type": "llava_t5",
33
  "n_positions": 512,
34
  "num_decoder_layers": 24,
@@ -73,5 +73,102 @@
73
  "tune_mm_mlp_adapter": false,
74
  "use_cache": false,
75
  "use_mm_proj": true,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  "vocab_size": 32128
77
  }
 
28
  "mm_use_im_start_end": false,
29
  "mm_vision_select_feature": "patch",
30
  "mm_vision_select_layer": -1,
31
+ "mm_vision_tower": "/home/masry20/projects/def-enamul/masry20/llms-models/unichart-encoder-512",
32
  "model_type": "llava_t5",
33
  "n_positions": 512,
34
  "num_decoder_layers": 24,
 
73
  "tune_mm_mlp_adapter": false,
74
  "use_cache": false,
75
  "use_mm_proj": true,
76
+ "vision_config": {
77
+ "_name_or_path": "",
78
+ "add_cross_attention": false,
79
+ "architectures": [
80
+ "DonutSwinModel"
81
+ ],
82
+ "attention_probs_dropout_prob": 0.0,
83
+ "bad_words_ids": null,
84
+ "begin_suppress_tokens": null,
85
+ "bos_token_id": null,
86
+ "chunk_size_feed_forward": 0,
87
+ "cross_attention_hidden_size": null,
88
+ "decoder_start_token_id": null,
89
+ "depths": [
90
+ 2,
91
+ 2,
92
+ 14,
93
+ 2
94
+ ],
95
+ "diversity_penalty": 0.0,
96
+ "do_sample": false,
97
+ "drop_path_rate": 0.1,
98
+ "early_stopping": false,
99
+ "embed_dim": 128,
100
+ "encoder_no_repeat_ngram_size": 0,
101
+ "eos_token_id": null,
102
+ "exponential_decay_length_penalty": null,
103
+ "finetuning_task": null,
104
+ "forced_bos_token_id": null,
105
+ "forced_eos_token_id": null,
106
+ "hidden_act": "gelu",
107
+ "hidden_dropout_prob": 0.0,
108
+ "hidden_size": 1024,
109
+ "id2label": {
110
+ "0": "LABEL_0",
111
+ "1": "LABEL_1"
112
+ },
113
+ "image_size": [
114
+ 512,
115
+ 512
116
+ ],
117
+ "initializer_range": 0.02,
118
+ "is_decoder": false,
119
+ "is_encoder_decoder": false,
120
+ "label2id": {
121
+ "LABEL_0": 0,
122
+ "LABEL_1": 1
123
+ },
124
+ "layer_norm_eps": 1e-05,
125
+ "length_penalty": 1.0,
126
+ "max_length": 20,
127
+ "min_length": 0,
128
+ "mlp_ratio": 4.0,
129
+ "model_type": "donut-swin",
130
+ "no_repeat_ngram_size": 0,
131
+ "num_beam_groups": 1,
132
+ "num_beams": 1,
133
+ "num_channels": 3,
134
+ "num_heads": [
135
+ 4,
136
+ 8,
137
+ 16,
138
+ 32
139
+ ],
140
+ "num_layers": 4,
141
+ "num_return_sequences": 1,
142
+ "output_attentions": false,
143
+ "output_hidden_states": false,
144
+ "output_scores": false,
145
+ "pad_token_id": null,
146
+ "patch_size": 4,
147
+ "path_norm": true,
148
+ "prefix": null,
149
+ "problem_type": null,
150
+ "pruned_heads": {},
151
+ "qkv_bias": true,
152
+ "remove_invalid_values": false,
153
+ "repetition_penalty": 1.0,
154
+ "return_dict": true,
155
+ "return_dict_in_generate": false,
156
+ "sep_token_id": null,
157
+ "suppress_tokens": null,
158
+ "task_specific_params": null,
159
+ "temperature": 1.0,
160
+ "tf_legacy_loss": false,
161
+ "tie_encoder_decoder": false,
162
+ "tie_word_embeddings": true,
163
+ "tokenizer_class": null,
164
+ "top_k": 50,
165
+ "top_p": 1.0,
166
+ "torch_dtype": "float32",
167
+ "torchscript": false,
168
+ "typical_p": 1.0,
169
+ "use_absolute_embeddings": false,
170
+ "use_bfloat16": false,
171
+ "window_size": 10
172
+ },
173
  "vocab_size": 32128
174
  }
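The config.json diff above embeds the full DonutSwin vision_config in the top-level config (and repoints mm_vision_tower at a local path), matching the new LlavaT5Stack, which builds DonutSwinModel(config=config.vision_config) directly. The commit does not show how the nested dict becomes a config object; a hypothetical sketch of the usual nested-config pattern (the __init__ body is an assumption, not the repo's actual LlavaT5Config):

from transformers import T5Config, DonutSwinConfig

class LlavaT5Config(T5Config):
    model_type = "llava_t5"

    def __init__(self, vision_config=None, mm_hidden_size=1024, **kwargs):
        super().__init__(**kwargs)
        # config.json stores vision_config as a plain dict; wrapping it lets
        # DonutSwinModel(config=self.vision_config) receive a proper config object.
        if isinstance(vision_config, dict):
            vision_config = DonutSwinConfig(**vision_config)
        self.vision_config = vision_config
        self.mm_hidden_size = mm_hidden_size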
model-00001-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:719f6af29df4fe3cae844752de131258afdebe6bad05a513533d654a7f3a2ad9
3
- size 4986432872
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e6b8f697fd0f6cd28ad003060372ec0787530fb6aec464ec575039f5b089001c
3
+ size 4998389912
model-00002-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:71428b8d193b4234668328861cd30d0abd6ce25bccdaa12de5ce0aa90fde81e9
3
- size 4991730200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d52e625d16b7cd51b5d0cd2e4f8ee1a9e593b540be21346150e7a68c2124990
3
+ size 4984514272
model-00003-of-00003.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c0059c19a7881bdff91877bedc0b46cd6d2cc6ee1b3ab6ea0581fa41604e6aaa
3
- size 1429331352
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3bfe7d13f294454178cf4857235c8d521fc45fa02a8f0a9e876a18ae5e58ce8d
3
+ size 1722967504
model.safetensors.index.json CHANGED
@@ -1,17 +1,17 @@
1
  {
2
  "metadata": {
3
- "total_size": 11407425536
4
  },
5
  "weight_map": {
6
- "decoder.block.0.layer.0.SelfAttention.k.weight": "model-00001-of-00003.safetensors",
7
- "decoder.block.0.layer.0.SelfAttention.o.weight": "model-00001-of-00003.safetensors",
8
- "decoder.block.0.layer.0.SelfAttention.q.weight": "model-00001-of-00003.safetensors",
9
- "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00001-of-00003.safetensors",
10
- "decoder.block.0.layer.0.SelfAttention.v.weight": "model-00001-of-00003.safetensors",
11
- "decoder.block.0.layer.0.layer_norm.weight": "model-00001-of-00003.safetensors",
12
  "decoder.block.0.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors",
13
  "decoder.block.0.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors",
14
- "decoder.block.0.layer.1.EncDecAttention.q.weight": "model-00001-of-00003.safetensors",
15
  "decoder.block.0.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors",
16
  "decoder.block.0.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors",
17
  "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors",
@@ -150,24 +150,24 @@
150
  "decoder.block.18.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors",
151
  "decoder.block.18.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors",
152
  "decoder.block.18.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors",
153
- "decoder.block.18.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors",
154
  "decoder.block.18.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors",
155
- "decoder.block.18.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors",
156
- "decoder.block.18.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors",
157
- "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors",
158
- "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "model-00002-of-00003.safetensors",
159
- "decoder.block.18.layer.2.DenseReluDense.wo.weight": "model-00002-of-00003.safetensors",
160
- "decoder.block.18.layer.2.layer_norm.weight": "model-00002-of-00003.safetensors",
161
- "decoder.block.19.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors",
162
- "decoder.block.19.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors",
163
- "decoder.block.19.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors",
164
- "decoder.block.19.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors",
165
- "decoder.block.19.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors",
166
- "decoder.block.19.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors",
167
- "decoder.block.19.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors",
168
- "decoder.block.19.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors",
169
- "decoder.block.19.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors",
170
- "decoder.block.19.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors",
171
  "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00003.safetensors",
172
  "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00003.safetensors",
173
  "decoder.block.19.layer.2.DenseReluDense.wo.weight": "model-00003-of-00003.safetensors",
@@ -559,8 +559,381 @@
559
  "encoder.block.9.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors",
560
  "encoder.block.9.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors",
561
  "encoder.final_layer_norm.weight": "model-00001-of-00003.safetensors",
562
- "encoder.mm_projector.bias": "model-00001-of-00003.safetensors",
563
- "encoder.mm_projector.weight": "model-00001-of-00003.safetensors",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
  "lm_head.weight": "model-00003-of-00003.safetensors",
565
  "shared.weight": "model-00001-of-00003.safetensors"
566
  }
 
1
  {
2
  "metadata": {
3
+ "total_size": 11705748448
4
  },
5
  "weight_map": {
6
+ "decoder.block.0.layer.0.SelfAttention.k.weight": "model-00002-of-00003.safetensors",
7
+ "decoder.block.0.layer.0.SelfAttention.o.weight": "model-00002-of-00003.safetensors",
8
+ "decoder.block.0.layer.0.SelfAttention.q.weight": "model-00002-of-00003.safetensors",
9
+ "decoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight": "model-00002-of-00003.safetensors",
10
+ "decoder.block.0.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors",
11
+ "decoder.block.0.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors",
12
  "decoder.block.0.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors",
13
  "decoder.block.0.layer.1.EncDecAttention.o.weight": "model-00002-of-00003.safetensors",
14
+ "decoder.block.0.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors",
15
  "decoder.block.0.layer.1.EncDecAttention.v.weight": "model-00002-of-00003.safetensors",
16
  "decoder.block.0.layer.1.layer_norm.weight": "model-00002-of-00003.safetensors",
17
  "decoder.block.0.layer.2.DenseReluDense.wi_0.weight": "model-00002-of-00003.safetensors",
 
150
  "decoder.block.18.layer.0.SelfAttention.v.weight": "model-00002-of-00003.safetensors",
151
  "decoder.block.18.layer.0.layer_norm.weight": "model-00002-of-00003.safetensors",
152
  "decoder.block.18.layer.1.EncDecAttention.k.weight": "model-00002-of-00003.safetensors",
153
+ "decoder.block.18.layer.1.EncDecAttention.o.weight": "model-00003-of-00003.safetensors",
154
  "decoder.block.18.layer.1.EncDecAttention.q.weight": "model-00002-of-00003.safetensors",
155
+ "decoder.block.18.layer.1.EncDecAttention.v.weight": "model-00003-of-00003.safetensors",
156
+ "decoder.block.18.layer.1.layer_norm.weight": "model-00003-of-00003.safetensors",
157
+ "decoder.block.18.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00003.safetensors",
158
+ "decoder.block.18.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00003.safetensors",
159
+ "decoder.block.18.layer.2.DenseReluDense.wo.weight": "model-00003-of-00003.safetensors",
160
+ "decoder.block.18.layer.2.layer_norm.weight": "model-00003-of-00003.safetensors",
161
+ "decoder.block.19.layer.0.SelfAttention.k.weight": "model-00003-of-00003.safetensors",
162
+ "decoder.block.19.layer.0.SelfAttention.o.weight": "model-00003-of-00003.safetensors",
163
+ "decoder.block.19.layer.0.SelfAttention.q.weight": "model-00003-of-00003.safetensors",
164
+ "decoder.block.19.layer.0.SelfAttention.v.weight": "model-00003-of-00003.safetensors",
165
+ "decoder.block.19.layer.0.layer_norm.weight": "model-00003-of-00003.safetensors",
166
+ "decoder.block.19.layer.1.EncDecAttention.k.weight": "model-00003-of-00003.safetensors",
167
+ "decoder.block.19.layer.1.EncDecAttention.o.weight": "model-00003-of-00003.safetensors",
168
+ "decoder.block.19.layer.1.EncDecAttention.q.weight": "model-00003-of-00003.safetensors",
169
+ "decoder.block.19.layer.1.EncDecAttention.v.weight": "model-00003-of-00003.safetensors",
170
+ "decoder.block.19.layer.1.layer_norm.weight": "model-00003-of-00003.safetensors",
171
  "decoder.block.19.layer.2.DenseReluDense.wi_0.weight": "model-00003-of-00003.safetensors",
172
  "decoder.block.19.layer.2.DenseReluDense.wi_1.weight": "model-00003-of-00003.safetensors",
173
  "decoder.block.19.layer.2.DenseReluDense.wo.weight": "model-00003-of-00003.safetensors",
 
559
  "encoder.block.9.layer.1.DenseReluDense.wo.weight": "model-00001-of-00003.safetensors",
560
  "encoder.block.9.layer.1.layer_norm.weight": "model-00001-of-00003.safetensors",
561
  "encoder.final_layer_norm.weight": "model-00001-of-00003.safetensors",
562
+ "encoder.mm_projector.bias": "model-00002-of-00003.safetensors",
563
+ "encoder.mm_projector.weight": "model-00002-of-00003.safetensors",
564
+ "encoder.vision_tower.embeddings.norm.bias": "model-00001-of-00003.safetensors",
565
+ "encoder.vision_tower.embeddings.norm.weight": "model-00001-of-00003.safetensors",
566
+ "encoder.vision_tower.embeddings.patch_embeddings.projection.bias": "model-00001-of-00003.safetensors",
567
+ "encoder.vision_tower.embeddings.patch_embeddings.projection.weight": "model-00001-of-00003.safetensors",
568
+ "encoder.vision_tower.encoder.layers.0.blocks.0.attention.output.dense.bias": "model-00001-of-00003.safetensors",
569
+ "encoder.vision_tower.encoder.layers.0.blocks.0.attention.output.dense.weight": "model-00001-of-00003.safetensors",
570
+ "encoder.vision_tower.encoder.layers.0.blocks.0.attention.self.key.bias": "model-00001-of-00003.safetensors",
571
+ "encoder.vision_tower.encoder.layers.0.blocks.0.attention.self.key.weight": "model-00001-of-00003.safetensors",
572
+ "encoder.vision_tower.encoder.layers.0.blocks.0.attention.self.query.bias": "model-00001-of-00003.safetensors",
573
+ "encoder.vision_tower.encoder.layers.0.blocks.0.attention.self.query.weight": "model-00001-of-00003.safetensors",
574
+ "encoder.vision_tower.encoder.layers.0.blocks.0.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
575
+ "encoder.vision_tower.encoder.layers.0.blocks.0.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
576
+ "encoder.vision_tower.encoder.layers.0.blocks.0.attention.self.value.bias": "model-00001-of-00003.safetensors",
577
+ "encoder.vision_tower.encoder.layers.0.blocks.0.attention.self.value.weight": "model-00001-of-00003.safetensors",
578
+ "encoder.vision_tower.encoder.layers.0.blocks.0.intermediate.dense.bias": "model-00001-of-00003.safetensors",
579
+ "encoder.vision_tower.encoder.layers.0.blocks.0.intermediate.dense.weight": "model-00001-of-00003.safetensors",
580
+ "encoder.vision_tower.encoder.layers.0.blocks.0.layernorm_after.bias": "model-00001-of-00003.safetensors",
581
+ "encoder.vision_tower.encoder.layers.0.blocks.0.layernorm_after.weight": "model-00001-of-00003.safetensors",
582
+ "encoder.vision_tower.encoder.layers.0.blocks.0.layernorm_before.bias": "model-00001-of-00003.safetensors",
583
+ "encoder.vision_tower.encoder.layers.0.blocks.0.layernorm_before.weight": "model-00001-of-00003.safetensors",
584
+ "encoder.vision_tower.encoder.layers.0.blocks.0.output.dense.bias": "model-00001-of-00003.safetensors",
585
+ "encoder.vision_tower.encoder.layers.0.blocks.0.output.dense.weight": "model-00001-of-00003.safetensors",
586
+ "encoder.vision_tower.encoder.layers.0.blocks.1.attention.output.dense.bias": "model-00001-of-00003.safetensors",
587
+ "encoder.vision_tower.encoder.layers.0.blocks.1.attention.output.dense.weight": "model-00001-of-00003.safetensors",
588
+ "encoder.vision_tower.encoder.layers.0.blocks.1.attention.self.key.bias": "model-00001-of-00003.safetensors",
589
+ "encoder.vision_tower.encoder.layers.0.blocks.1.attention.self.key.weight": "model-00001-of-00003.safetensors",
590
+ "encoder.vision_tower.encoder.layers.0.blocks.1.attention.self.query.bias": "model-00001-of-00003.safetensors",
591
+ "encoder.vision_tower.encoder.layers.0.blocks.1.attention.self.query.weight": "model-00001-of-00003.safetensors",
592
+ "encoder.vision_tower.encoder.layers.0.blocks.1.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
593
+ "encoder.vision_tower.encoder.layers.0.blocks.1.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
594
+ "encoder.vision_tower.encoder.layers.0.blocks.1.attention.self.value.bias": "model-00001-of-00003.safetensors",
595
+ "encoder.vision_tower.encoder.layers.0.blocks.1.attention.self.value.weight": "model-00001-of-00003.safetensors",
596
+ "encoder.vision_tower.encoder.layers.0.blocks.1.intermediate.dense.bias": "model-00001-of-00003.safetensors",
597
+ "encoder.vision_tower.encoder.layers.0.blocks.1.intermediate.dense.weight": "model-00001-of-00003.safetensors",
598
+ "encoder.vision_tower.encoder.layers.0.blocks.1.layernorm_after.bias": "model-00001-of-00003.safetensors",
599
+ "encoder.vision_tower.encoder.layers.0.blocks.1.layernorm_after.weight": "model-00001-of-00003.safetensors",
600
+ "encoder.vision_tower.encoder.layers.0.blocks.1.layernorm_before.bias": "model-00001-of-00003.safetensors",
601
+ "encoder.vision_tower.encoder.layers.0.blocks.1.layernorm_before.weight": "model-00001-of-00003.safetensors",
602
+ "encoder.vision_tower.encoder.layers.0.blocks.1.output.dense.bias": "model-00001-of-00003.safetensors",
603
+ "encoder.vision_tower.encoder.layers.0.blocks.1.output.dense.weight": "model-00001-of-00003.safetensors",
604
+ "encoder.vision_tower.encoder.layers.0.downsample.norm.bias": "model-00001-of-00003.safetensors",
605
+ "encoder.vision_tower.encoder.layers.0.downsample.norm.weight": "model-00001-of-00003.safetensors",
606
+ "encoder.vision_tower.encoder.layers.0.downsample.reduction.weight": "model-00001-of-00003.safetensors",
607
+ "encoder.vision_tower.encoder.layers.1.blocks.0.attention.output.dense.bias": "model-00001-of-00003.safetensors",
608
+ "encoder.vision_tower.encoder.layers.1.blocks.0.attention.output.dense.weight": "model-00001-of-00003.safetensors",
609
+ "encoder.vision_tower.encoder.layers.1.blocks.0.attention.self.key.bias": "model-00001-of-00003.safetensors",
610
+ "encoder.vision_tower.encoder.layers.1.blocks.0.attention.self.key.weight": "model-00001-of-00003.safetensors",
611
+ "encoder.vision_tower.encoder.layers.1.blocks.0.attention.self.query.bias": "model-00001-of-00003.safetensors",
612
+ "encoder.vision_tower.encoder.layers.1.blocks.0.attention.self.query.weight": "model-00001-of-00003.safetensors",
613
+ "encoder.vision_tower.encoder.layers.1.blocks.0.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
614
+ "encoder.vision_tower.encoder.layers.1.blocks.0.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
615
+ "encoder.vision_tower.encoder.layers.1.blocks.0.attention.self.value.bias": "model-00001-of-00003.safetensors",
616
+ "encoder.vision_tower.encoder.layers.1.blocks.0.attention.self.value.weight": "model-00001-of-00003.safetensors",
617
+ "encoder.vision_tower.encoder.layers.1.blocks.0.intermediate.dense.bias": "model-00001-of-00003.safetensors",
618
+ "encoder.vision_tower.encoder.layers.1.blocks.0.intermediate.dense.weight": "model-00001-of-00003.safetensors",
619
+ "encoder.vision_tower.encoder.layers.1.blocks.0.layernorm_after.bias": "model-00001-of-00003.safetensors",
620
+ "encoder.vision_tower.encoder.layers.1.blocks.0.layernorm_after.weight": "model-00001-of-00003.safetensors",
621
+ "encoder.vision_tower.encoder.layers.1.blocks.0.layernorm_before.bias": "model-00001-of-00003.safetensors",
622
+ "encoder.vision_tower.encoder.layers.1.blocks.0.layernorm_before.weight": "model-00001-of-00003.safetensors",
623
+ "encoder.vision_tower.encoder.layers.1.blocks.0.output.dense.bias": "model-00001-of-00003.safetensors",
624
+ "encoder.vision_tower.encoder.layers.1.blocks.0.output.dense.weight": "model-00001-of-00003.safetensors",
625
+ "encoder.vision_tower.encoder.layers.1.blocks.1.attention.output.dense.bias": "model-00001-of-00003.safetensors",
626
+ "encoder.vision_tower.encoder.layers.1.blocks.1.attention.output.dense.weight": "model-00001-of-00003.safetensors",
627
+ "encoder.vision_tower.encoder.layers.1.blocks.1.attention.self.key.bias": "model-00001-of-00003.safetensors",
628
+ "encoder.vision_tower.encoder.layers.1.blocks.1.attention.self.key.weight": "model-00001-of-00003.safetensors",
629
+ "encoder.vision_tower.encoder.layers.1.blocks.1.attention.self.query.bias": "model-00001-of-00003.safetensors",
630
+ "encoder.vision_tower.encoder.layers.1.blocks.1.attention.self.query.weight": "model-00001-of-00003.safetensors",
631
+ "encoder.vision_tower.encoder.layers.1.blocks.1.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
632
+ "encoder.vision_tower.encoder.layers.1.blocks.1.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
633
+ "encoder.vision_tower.encoder.layers.1.blocks.1.attention.self.value.bias": "model-00001-of-00003.safetensors",
634
+ "encoder.vision_tower.encoder.layers.1.blocks.1.attention.self.value.weight": "model-00001-of-00003.safetensors",
635
+ "encoder.vision_tower.encoder.layers.1.blocks.1.intermediate.dense.bias": "model-00001-of-00003.safetensors",
636
+ "encoder.vision_tower.encoder.layers.1.blocks.1.intermediate.dense.weight": "model-00001-of-00003.safetensors",
637
+ "encoder.vision_tower.encoder.layers.1.blocks.1.layernorm_after.bias": "model-00001-of-00003.safetensors",
638
+ "encoder.vision_tower.encoder.layers.1.blocks.1.layernorm_after.weight": "model-00001-of-00003.safetensors",
639
+ "encoder.vision_tower.encoder.layers.1.blocks.1.layernorm_before.bias": "model-00001-of-00003.safetensors",
640
+ "encoder.vision_tower.encoder.layers.1.blocks.1.layernorm_before.weight": "model-00001-of-00003.safetensors",
641
+ "encoder.vision_tower.encoder.layers.1.blocks.1.output.dense.bias": "model-00001-of-00003.safetensors",
642
+ "encoder.vision_tower.encoder.layers.1.blocks.1.output.dense.weight": "model-00001-of-00003.safetensors",
643
+ "encoder.vision_tower.encoder.layers.1.downsample.norm.bias": "model-00001-of-00003.safetensors",
644
+ "encoder.vision_tower.encoder.layers.1.downsample.norm.weight": "model-00001-of-00003.safetensors",
645
+ "encoder.vision_tower.encoder.layers.1.downsample.reduction.weight": "model-00001-of-00003.safetensors",
646
+ "encoder.vision_tower.encoder.layers.2.blocks.0.attention.output.dense.bias": "model-00001-of-00003.safetensors",
647
+ "encoder.vision_tower.encoder.layers.2.blocks.0.attention.output.dense.weight": "model-00001-of-00003.safetensors",
648
+ "encoder.vision_tower.encoder.layers.2.blocks.0.attention.self.key.bias": "model-00001-of-00003.safetensors",
649
+ "encoder.vision_tower.encoder.layers.2.blocks.0.attention.self.key.weight": "model-00001-of-00003.safetensors",
650
+ "encoder.vision_tower.encoder.layers.2.blocks.0.attention.self.query.bias": "model-00001-of-00003.safetensors",
651
+ "encoder.vision_tower.encoder.layers.2.blocks.0.attention.self.query.weight": "model-00001-of-00003.safetensors",
652
+ "encoder.vision_tower.encoder.layers.2.blocks.0.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
653
+ "encoder.vision_tower.encoder.layers.2.blocks.0.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
654
+ "encoder.vision_tower.encoder.layers.2.blocks.0.attention.self.value.bias": "model-00001-of-00003.safetensors",
655
+ "encoder.vision_tower.encoder.layers.2.blocks.0.attention.self.value.weight": "model-00001-of-00003.safetensors",
656
+ "encoder.vision_tower.encoder.layers.2.blocks.0.intermediate.dense.bias": "model-00001-of-00003.safetensors",
657
+ "encoder.vision_tower.encoder.layers.2.blocks.0.intermediate.dense.weight": "model-00001-of-00003.safetensors",
658
+ "encoder.vision_tower.encoder.layers.2.blocks.0.layernorm_after.bias": "model-00001-of-00003.safetensors",
659
+ "encoder.vision_tower.encoder.layers.2.blocks.0.layernorm_after.weight": "model-00001-of-00003.safetensors",
660
+ "encoder.vision_tower.encoder.layers.2.blocks.0.layernorm_before.bias": "model-00001-of-00003.safetensors",
661
+ "encoder.vision_tower.encoder.layers.2.blocks.0.layernorm_before.weight": "model-00001-of-00003.safetensors",
662
+ "encoder.vision_tower.encoder.layers.2.blocks.0.output.dense.bias": "model-00001-of-00003.safetensors",
663
+ "encoder.vision_tower.encoder.layers.2.blocks.0.output.dense.weight": "model-00001-of-00003.safetensors",
664
+ "encoder.vision_tower.encoder.layers.2.blocks.1.attention.output.dense.bias": "model-00001-of-00003.safetensors",
665
+ "encoder.vision_tower.encoder.layers.2.blocks.1.attention.output.dense.weight": "model-00001-of-00003.safetensors",
666
+ "encoder.vision_tower.encoder.layers.2.blocks.1.attention.self.key.bias": "model-00001-of-00003.safetensors",
667
+ "encoder.vision_tower.encoder.layers.2.blocks.1.attention.self.key.weight": "model-00001-of-00003.safetensors",
668
+ "encoder.vision_tower.encoder.layers.2.blocks.1.attention.self.query.bias": "model-00001-of-00003.safetensors",
669
+ "encoder.vision_tower.encoder.layers.2.blocks.1.attention.self.query.weight": "model-00001-of-00003.safetensors",
670
+ "encoder.vision_tower.encoder.layers.2.blocks.1.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
671
+ "encoder.vision_tower.encoder.layers.2.blocks.1.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
672
+ "encoder.vision_tower.encoder.layers.2.blocks.1.attention.self.value.bias": "model-00001-of-00003.safetensors",
673
+ "encoder.vision_tower.encoder.layers.2.blocks.1.attention.self.value.weight": "model-00001-of-00003.safetensors",
674
+ "encoder.vision_tower.encoder.layers.2.blocks.1.intermediate.dense.bias": "model-00001-of-00003.safetensors",
675
+ "encoder.vision_tower.encoder.layers.2.blocks.1.intermediate.dense.weight": "model-00001-of-00003.safetensors",
676
+ "encoder.vision_tower.encoder.layers.2.blocks.1.layernorm_after.bias": "model-00001-of-00003.safetensors",
677
+ "encoder.vision_tower.encoder.layers.2.blocks.1.layernorm_after.weight": "model-00001-of-00003.safetensors",
678
+ "encoder.vision_tower.encoder.layers.2.blocks.1.layernorm_before.bias": "model-00001-of-00003.safetensors",
679
+ "encoder.vision_tower.encoder.layers.2.blocks.1.layernorm_before.weight": "model-00001-of-00003.safetensors",
680
+ "encoder.vision_tower.encoder.layers.2.blocks.1.output.dense.bias": "model-00001-of-00003.safetensors",
681
+ "encoder.vision_tower.encoder.layers.2.blocks.1.output.dense.weight": "model-00001-of-00003.safetensors",
682
+ "encoder.vision_tower.encoder.layers.2.blocks.10.attention.output.dense.bias": "model-00002-of-00003.safetensors",
683
+ "encoder.vision_tower.encoder.layers.2.blocks.10.attention.output.dense.weight": "model-00002-of-00003.safetensors",
684
+ "encoder.vision_tower.encoder.layers.2.blocks.10.attention.self.key.bias": "model-00002-of-00003.safetensors",
685
+ "encoder.vision_tower.encoder.layers.2.blocks.10.attention.self.key.weight": "model-00002-of-00003.safetensors",
686
+ "encoder.vision_tower.encoder.layers.2.blocks.10.attention.self.query.bias": "model-00002-of-00003.safetensors",
687
+ "encoder.vision_tower.encoder.layers.2.blocks.10.attention.self.query.weight": "model-00002-of-00003.safetensors",
688
+ "encoder.vision_tower.encoder.layers.2.blocks.10.attention.self.relative_position_bias_table": "model-00002-of-00003.safetensors",
689
+ "encoder.vision_tower.encoder.layers.2.blocks.10.attention.self.relative_position_index": "model-00002-of-00003.safetensors",
690
+ "encoder.vision_tower.encoder.layers.2.blocks.10.attention.self.value.bias": "model-00002-of-00003.safetensors",
691
+ "encoder.vision_tower.encoder.layers.2.blocks.10.attention.self.value.weight": "model-00002-of-00003.safetensors",
692
+ "encoder.vision_tower.encoder.layers.2.blocks.10.intermediate.dense.bias": "model-00002-of-00003.safetensors",
693
+ "encoder.vision_tower.encoder.layers.2.blocks.10.intermediate.dense.weight": "model-00002-of-00003.safetensors",
694
+ "encoder.vision_tower.encoder.layers.2.blocks.10.layernorm_after.bias": "model-00002-of-00003.safetensors",
695
+ "encoder.vision_tower.encoder.layers.2.blocks.10.layernorm_after.weight": "model-00002-of-00003.safetensors",
696
+ "encoder.vision_tower.encoder.layers.2.blocks.10.layernorm_before.bias": "model-00002-of-00003.safetensors",
697
+ "encoder.vision_tower.encoder.layers.2.blocks.10.layernorm_before.weight": "model-00002-of-00003.safetensors",
698
+ "encoder.vision_tower.encoder.layers.2.blocks.10.output.dense.bias": "model-00002-of-00003.safetensors",
699
+ "encoder.vision_tower.encoder.layers.2.blocks.10.output.dense.weight": "model-00002-of-00003.safetensors",
700
+ "encoder.vision_tower.encoder.layers.2.blocks.11.attention.output.dense.bias": "model-00002-of-00003.safetensors",
701
+ "encoder.vision_tower.encoder.layers.2.blocks.11.attention.output.dense.weight": "model-00002-of-00003.safetensors",
702
+ "encoder.vision_tower.encoder.layers.2.blocks.11.attention.self.key.bias": "model-00002-of-00003.safetensors",
703
+ "encoder.vision_tower.encoder.layers.2.blocks.11.attention.self.key.weight": "model-00002-of-00003.safetensors",
704
+ "encoder.vision_tower.encoder.layers.2.blocks.11.attention.self.query.bias": "model-00002-of-00003.safetensors",
705
+ "encoder.vision_tower.encoder.layers.2.blocks.11.attention.self.query.weight": "model-00002-of-00003.safetensors",
706
+ "encoder.vision_tower.encoder.layers.2.blocks.11.attention.self.relative_position_bias_table": "model-00002-of-00003.safetensors",
707
+ "encoder.vision_tower.encoder.layers.2.blocks.11.attention.self.relative_position_index": "model-00002-of-00003.safetensors",
708
+ "encoder.vision_tower.encoder.layers.2.blocks.11.attention.self.value.bias": "model-00002-of-00003.safetensors",
709
+ "encoder.vision_tower.encoder.layers.2.blocks.11.attention.self.value.weight": "model-00002-of-00003.safetensors",
710
+ "encoder.vision_tower.encoder.layers.2.blocks.11.intermediate.dense.bias": "model-00002-of-00003.safetensors",
711
+ "encoder.vision_tower.encoder.layers.2.blocks.11.intermediate.dense.weight": "model-00002-of-00003.safetensors",
712
+ "encoder.vision_tower.encoder.layers.2.blocks.11.layernorm_after.bias": "model-00002-of-00003.safetensors",
713
+ "encoder.vision_tower.encoder.layers.2.blocks.11.layernorm_after.weight": "model-00002-of-00003.safetensors",
714
+ "encoder.vision_tower.encoder.layers.2.blocks.11.layernorm_before.bias": "model-00002-of-00003.safetensors",
715
+ "encoder.vision_tower.encoder.layers.2.blocks.11.layernorm_before.weight": "model-00002-of-00003.safetensors",
716
+ "encoder.vision_tower.encoder.layers.2.blocks.11.output.dense.bias": "model-00002-of-00003.safetensors",
717
+ "encoder.vision_tower.encoder.layers.2.blocks.11.output.dense.weight": "model-00002-of-00003.safetensors",
718
+ "encoder.vision_tower.encoder.layers.2.blocks.12.attention.output.dense.bias": "model-00002-of-00003.safetensors",
719
+ "encoder.vision_tower.encoder.layers.2.blocks.12.attention.output.dense.weight": "model-00002-of-00003.safetensors",
720
+ "encoder.vision_tower.encoder.layers.2.blocks.12.attention.self.key.bias": "model-00002-of-00003.safetensors",
721
+ "encoder.vision_tower.encoder.layers.2.blocks.12.attention.self.key.weight": "model-00002-of-00003.safetensors",
722
+ "encoder.vision_tower.encoder.layers.2.blocks.12.attention.self.query.bias": "model-00002-of-00003.safetensors",
723
+ "encoder.vision_tower.encoder.layers.2.blocks.12.attention.self.query.weight": "model-00002-of-00003.safetensors",
724
+ "encoder.vision_tower.encoder.layers.2.blocks.12.attention.self.relative_position_bias_table": "model-00002-of-00003.safetensors",
725
+ "encoder.vision_tower.encoder.layers.2.blocks.12.attention.self.relative_position_index": "model-00002-of-00003.safetensors",
726
+ "encoder.vision_tower.encoder.layers.2.blocks.12.attention.self.value.bias": "model-00002-of-00003.safetensors",
727
+ "encoder.vision_tower.encoder.layers.2.blocks.12.attention.self.value.weight": "model-00002-of-00003.safetensors",
728
+ "encoder.vision_tower.encoder.layers.2.blocks.12.intermediate.dense.bias": "model-00002-of-00003.safetensors",
729
+ "encoder.vision_tower.encoder.layers.2.blocks.12.intermediate.dense.weight": "model-00002-of-00003.safetensors",
730
+ "encoder.vision_tower.encoder.layers.2.blocks.12.layernorm_after.bias": "model-00002-of-00003.safetensors",
731
+ "encoder.vision_tower.encoder.layers.2.blocks.12.layernorm_after.weight": "model-00002-of-00003.safetensors",
732
+ "encoder.vision_tower.encoder.layers.2.blocks.12.layernorm_before.bias": "model-00002-of-00003.safetensors",
733
+ "encoder.vision_tower.encoder.layers.2.blocks.12.layernorm_before.weight": "model-00002-of-00003.safetensors",
734
+ "encoder.vision_tower.encoder.layers.2.blocks.12.output.dense.bias": "model-00002-of-00003.safetensors",
735
+ "encoder.vision_tower.encoder.layers.2.blocks.12.output.dense.weight": "model-00002-of-00003.safetensors",
736
+ "encoder.vision_tower.encoder.layers.2.blocks.13.attention.output.dense.bias": "model-00002-of-00003.safetensors",
737
+ "encoder.vision_tower.encoder.layers.2.blocks.13.attention.output.dense.weight": "model-00002-of-00003.safetensors",
738
+ "encoder.vision_tower.encoder.layers.2.blocks.13.attention.self.key.bias": "model-00002-of-00003.safetensors",
739
+ "encoder.vision_tower.encoder.layers.2.blocks.13.attention.self.key.weight": "model-00002-of-00003.safetensors",
740
+ "encoder.vision_tower.encoder.layers.2.blocks.13.attention.self.query.bias": "model-00002-of-00003.safetensors",
741
+ "encoder.vision_tower.encoder.layers.2.blocks.13.attention.self.query.weight": "model-00002-of-00003.safetensors",
742
+ "encoder.vision_tower.encoder.layers.2.blocks.13.attention.self.relative_position_bias_table": "model-00002-of-00003.safetensors",
743
+ "encoder.vision_tower.encoder.layers.2.blocks.13.attention.self.relative_position_index": "model-00002-of-00003.safetensors",
744
+ "encoder.vision_tower.encoder.layers.2.blocks.13.attention.self.value.bias": "model-00002-of-00003.safetensors",
745
+ "encoder.vision_tower.encoder.layers.2.blocks.13.attention.self.value.weight": "model-00002-of-00003.safetensors",
746
+ "encoder.vision_tower.encoder.layers.2.blocks.13.intermediate.dense.bias": "model-00002-of-00003.safetensors",
747
+ "encoder.vision_tower.encoder.layers.2.blocks.13.intermediate.dense.weight": "model-00002-of-00003.safetensors",
748
+ "encoder.vision_tower.encoder.layers.2.blocks.13.layernorm_after.bias": "model-00002-of-00003.safetensors",
749
+ "encoder.vision_tower.encoder.layers.2.blocks.13.layernorm_after.weight": "model-00002-of-00003.safetensors",
750
+ "encoder.vision_tower.encoder.layers.2.blocks.13.layernorm_before.bias": "model-00002-of-00003.safetensors",
751
+ "encoder.vision_tower.encoder.layers.2.blocks.13.layernorm_before.weight": "model-00002-of-00003.safetensors",
752
+ "encoder.vision_tower.encoder.layers.2.blocks.13.output.dense.bias": "model-00002-of-00003.safetensors",
753
+ "encoder.vision_tower.encoder.layers.2.blocks.13.output.dense.weight": "model-00002-of-00003.safetensors",
754
+ "encoder.vision_tower.encoder.layers.2.blocks.2.attention.output.dense.bias": "model-00001-of-00003.safetensors",
755
+ "encoder.vision_tower.encoder.layers.2.blocks.2.attention.output.dense.weight": "model-00001-of-00003.safetensors",
756
+ "encoder.vision_tower.encoder.layers.2.blocks.2.attention.self.key.bias": "model-00001-of-00003.safetensors",
757
+ "encoder.vision_tower.encoder.layers.2.blocks.2.attention.self.key.weight": "model-00001-of-00003.safetensors",
758
+ "encoder.vision_tower.encoder.layers.2.blocks.2.attention.self.query.bias": "model-00001-of-00003.safetensors",
759
+ "encoder.vision_tower.encoder.layers.2.blocks.2.attention.self.query.weight": "model-00001-of-00003.safetensors",
760
+ "encoder.vision_tower.encoder.layers.2.blocks.2.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
761
+ "encoder.vision_tower.encoder.layers.2.blocks.2.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
762
+ "encoder.vision_tower.encoder.layers.2.blocks.2.attention.self.value.bias": "model-00001-of-00003.safetensors",
763
+ "encoder.vision_tower.encoder.layers.2.blocks.2.attention.self.value.weight": "model-00001-of-00003.safetensors",
764
+ "encoder.vision_tower.encoder.layers.2.blocks.2.intermediate.dense.bias": "model-00001-of-00003.safetensors",
765
+ "encoder.vision_tower.encoder.layers.2.blocks.2.intermediate.dense.weight": "model-00001-of-00003.safetensors",
766
+ "encoder.vision_tower.encoder.layers.2.blocks.2.layernorm_after.bias": "model-00001-of-00003.safetensors",
767
+ "encoder.vision_tower.encoder.layers.2.blocks.2.layernorm_after.weight": "model-00001-of-00003.safetensors",
768
+ "encoder.vision_tower.encoder.layers.2.blocks.2.layernorm_before.bias": "model-00001-of-00003.safetensors",
769
+ "encoder.vision_tower.encoder.layers.2.blocks.2.layernorm_before.weight": "model-00001-of-00003.safetensors",
770
+ "encoder.vision_tower.encoder.layers.2.blocks.2.output.dense.bias": "model-00001-of-00003.safetensors",
771
+ "encoder.vision_tower.encoder.layers.2.blocks.2.output.dense.weight": "model-00001-of-00003.safetensors",
772
+ "encoder.vision_tower.encoder.layers.2.blocks.3.attention.output.dense.bias": "model-00001-of-00003.safetensors",
773
+ "encoder.vision_tower.encoder.layers.2.blocks.3.attention.output.dense.weight": "model-00001-of-00003.safetensors",
774
+ "encoder.vision_tower.encoder.layers.2.blocks.3.attention.self.key.bias": "model-00001-of-00003.safetensors",
775
+ "encoder.vision_tower.encoder.layers.2.blocks.3.attention.self.key.weight": "model-00001-of-00003.safetensors",
776
+ "encoder.vision_tower.encoder.layers.2.blocks.3.attention.self.query.bias": "model-00001-of-00003.safetensors",
777
+ "encoder.vision_tower.encoder.layers.2.blocks.3.attention.self.query.weight": "model-00001-of-00003.safetensors",
778
+ "encoder.vision_tower.encoder.layers.2.blocks.3.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
779
+ "encoder.vision_tower.encoder.layers.2.blocks.3.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
780
+ "encoder.vision_tower.encoder.layers.2.blocks.3.attention.self.value.bias": "model-00001-of-00003.safetensors",
781
+ "encoder.vision_tower.encoder.layers.2.blocks.3.attention.self.value.weight": "model-00001-of-00003.safetensors",
782
+ "encoder.vision_tower.encoder.layers.2.blocks.3.intermediate.dense.bias": "model-00001-of-00003.safetensors",
783
+ "encoder.vision_tower.encoder.layers.2.blocks.3.intermediate.dense.weight": "model-00001-of-00003.safetensors",
784
+ "encoder.vision_tower.encoder.layers.2.blocks.3.layernorm_after.bias": "model-00001-of-00003.safetensors",
785
+ "encoder.vision_tower.encoder.layers.2.blocks.3.layernorm_after.weight": "model-00001-of-00003.safetensors",
786
+ "encoder.vision_tower.encoder.layers.2.blocks.3.layernorm_before.bias": "model-00001-of-00003.safetensors",
787
+ "encoder.vision_tower.encoder.layers.2.blocks.3.layernorm_before.weight": "model-00001-of-00003.safetensors",
788
+ "encoder.vision_tower.encoder.layers.2.blocks.3.output.dense.bias": "model-00001-of-00003.safetensors",
789
+ "encoder.vision_tower.encoder.layers.2.blocks.3.output.dense.weight": "model-00001-of-00003.safetensors",
790
+ "encoder.vision_tower.encoder.layers.2.blocks.4.attention.output.dense.bias": "model-00001-of-00003.safetensors",
791
+ "encoder.vision_tower.encoder.layers.2.blocks.4.attention.output.dense.weight": "model-00001-of-00003.safetensors",
792
+ "encoder.vision_tower.encoder.layers.2.blocks.4.attention.self.key.bias": "model-00001-of-00003.safetensors",
793
+ "encoder.vision_tower.encoder.layers.2.blocks.4.attention.self.key.weight": "model-00001-of-00003.safetensors",
794
+ "encoder.vision_tower.encoder.layers.2.blocks.4.attention.self.query.bias": "model-00001-of-00003.safetensors",
795
+ "encoder.vision_tower.encoder.layers.2.blocks.4.attention.self.query.weight": "model-00001-of-00003.safetensors",
796
+ "encoder.vision_tower.encoder.layers.2.blocks.4.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
797
+ "encoder.vision_tower.encoder.layers.2.blocks.4.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
798
+ "encoder.vision_tower.encoder.layers.2.blocks.4.attention.self.value.bias": "model-00001-of-00003.safetensors",
799
+ "encoder.vision_tower.encoder.layers.2.blocks.4.attention.self.value.weight": "model-00001-of-00003.safetensors",
800
+ "encoder.vision_tower.encoder.layers.2.blocks.4.intermediate.dense.bias": "model-00001-of-00003.safetensors",
801
+ "encoder.vision_tower.encoder.layers.2.blocks.4.intermediate.dense.weight": "model-00001-of-00003.safetensors",
802
+ "encoder.vision_tower.encoder.layers.2.blocks.4.layernorm_after.bias": "model-00001-of-00003.safetensors",
803
+ "encoder.vision_tower.encoder.layers.2.blocks.4.layernorm_after.weight": "model-00001-of-00003.safetensors",
804
+ "encoder.vision_tower.encoder.layers.2.blocks.4.layernorm_before.bias": "model-00001-of-00003.safetensors",
805
+ "encoder.vision_tower.encoder.layers.2.blocks.4.layernorm_before.weight": "model-00001-of-00003.safetensors",
806
+ "encoder.vision_tower.encoder.layers.2.blocks.4.output.dense.bias": "model-00001-of-00003.safetensors",
807
+ "encoder.vision_tower.encoder.layers.2.blocks.4.output.dense.weight": "model-00001-of-00003.safetensors",
808
+ "encoder.vision_tower.encoder.layers.2.blocks.5.attention.output.dense.bias": "model-00001-of-00003.safetensors",
809
+ "encoder.vision_tower.encoder.layers.2.blocks.5.attention.output.dense.weight": "model-00001-of-00003.safetensors",
810
+ "encoder.vision_tower.encoder.layers.2.blocks.5.attention.self.key.bias": "model-00001-of-00003.safetensors",
811
+ "encoder.vision_tower.encoder.layers.2.blocks.5.attention.self.key.weight": "model-00001-of-00003.safetensors",
812
+ "encoder.vision_tower.encoder.layers.2.blocks.5.attention.self.query.bias": "model-00001-of-00003.safetensors",
813
+ "encoder.vision_tower.encoder.layers.2.blocks.5.attention.self.query.weight": "model-00001-of-00003.safetensors",
814
+ "encoder.vision_tower.encoder.layers.2.blocks.5.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
815
+ "encoder.vision_tower.encoder.layers.2.blocks.5.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
816
+ "encoder.vision_tower.encoder.layers.2.blocks.5.attention.self.value.bias": "model-00001-of-00003.safetensors",
817
+ "encoder.vision_tower.encoder.layers.2.blocks.5.attention.self.value.weight": "model-00001-of-00003.safetensors",
818
+ "encoder.vision_tower.encoder.layers.2.blocks.5.intermediate.dense.bias": "model-00001-of-00003.safetensors",
819
+ "encoder.vision_tower.encoder.layers.2.blocks.5.intermediate.dense.weight": "model-00001-of-00003.safetensors",
820
+ "encoder.vision_tower.encoder.layers.2.blocks.5.layernorm_after.bias": "model-00001-of-00003.safetensors",
821
+ "encoder.vision_tower.encoder.layers.2.blocks.5.layernorm_after.weight": "model-00001-of-00003.safetensors",
822
+ "encoder.vision_tower.encoder.layers.2.blocks.5.layernorm_before.bias": "model-00001-of-00003.safetensors",
823
+ "encoder.vision_tower.encoder.layers.2.blocks.5.layernorm_before.weight": "model-00001-of-00003.safetensors",
824
+ "encoder.vision_tower.encoder.layers.2.blocks.5.output.dense.bias": "model-00001-of-00003.safetensors",
825
+ "encoder.vision_tower.encoder.layers.2.blocks.5.output.dense.weight": "model-00001-of-00003.safetensors",
826
+ "encoder.vision_tower.encoder.layers.2.blocks.6.attention.output.dense.bias": "model-00001-of-00003.safetensors",
827
+ "encoder.vision_tower.encoder.layers.2.blocks.6.attention.output.dense.weight": "model-00001-of-00003.safetensors",
828
+ "encoder.vision_tower.encoder.layers.2.blocks.6.attention.self.key.bias": "model-00001-of-00003.safetensors",
829
+ "encoder.vision_tower.encoder.layers.2.blocks.6.attention.self.key.weight": "model-00001-of-00003.safetensors",
830
+ "encoder.vision_tower.encoder.layers.2.blocks.6.attention.self.query.bias": "model-00001-of-00003.safetensors",
831
+ "encoder.vision_tower.encoder.layers.2.blocks.6.attention.self.query.weight": "model-00001-of-00003.safetensors",
832
+ "encoder.vision_tower.encoder.layers.2.blocks.6.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
833
+ "encoder.vision_tower.encoder.layers.2.blocks.6.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
834
+ "encoder.vision_tower.encoder.layers.2.blocks.6.attention.self.value.bias": "model-00001-of-00003.safetensors",
835
+ "encoder.vision_tower.encoder.layers.2.blocks.6.attention.self.value.weight": "model-00001-of-00003.safetensors",
836
+ "encoder.vision_tower.encoder.layers.2.blocks.6.intermediate.dense.bias": "model-00001-of-00003.safetensors",
837
+ "encoder.vision_tower.encoder.layers.2.blocks.6.intermediate.dense.weight": "model-00001-of-00003.safetensors",
838
+ "encoder.vision_tower.encoder.layers.2.blocks.6.layernorm_after.bias": "model-00001-of-00003.safetensors",
839
+ "encoder.vision_tower.encoder.layers.2.blocks.6.layernorm_after.weight": "model-00001-of-00003.safetensors",
840
+ "encoder.vision_tower.encoder.layers.2.blocks.6.layernorm_before.bias": "model-00001-of-00003.safetensors",
841
+ "encoder.vision_tower.encoder.layers.2.blocks.6.layernorm_before.weight": "model-00001-of-00003.safetensors",
842
+ "encoder.vision_tower.encoder.layers.2.blocks.6.output.dense.bias": "model-00001-of-00003.safetensors",
843
+ "encoder.vision_tower.encoder.layers.2.blocks.6.output.dense.weight": "model-00001-of-00003.safetensors",
844
+ "encoder.vision_tower.encoder.layers.2.blocks.7.attention.output.dense.bias": "model-00001-of-00003.safetensors",
845
+ "encoder.vision_tower.encoder.layers.2.blocks.7.attention.output.dense.weight": "model-00001-of-00003.safetensors",
846
+ "encoder.vision_tower.encoder.layers.2.blocks.7.attention.self.key.bias": "model-00001-of-00003.safetensors",
847
+ "encoder.vision_tower.encoder.layers.2.blocks.7.attention.self.key.weight": "model-00001-of-00003.safetensors",
848
+ "encoder.vision_tower.encoder.layers.2.blocks.7.attention.self.query.bias": "model-00001-of-00003.safetensors",
849
+ "encoder.vision_tower.encoder.layers.2.blocks.7.attention.self.query.weight": "model-00001-of-00003.safetensors",
850
+ "encoder.vision_tower.encoder.layers.2.blocks.7.attention.self.relative_position_bias_table": "model-00001-of-00003.safetensors",
851
+ "encoder.vision_tower.encoder.layers.2.blocks.7.attention.self.relative_position_index": "model-00001-of-00003.safetensors",
852
+ "encoder.vision_tower.encoder.layers.2.blocks.7.attention.self.value.bias": "model-00001-of-00003.safetensors",
853
+ "encoder.vision_tower.encoder.layers.2.blocks.7.attention.self.value.weight": "model-00001-of-00003.safetensors",
854
+ "encoder.vision_tower.encoder.layers.2.blocks.7.intermediate.dense.bias": "model-00002-of-00003.safetensors",
855
+ "encoder.vision_tower.encoder.layers.2.blocks.7.intermediate.dense.weight": "model-00002-of-00003.safetensors",
856
+ "encoder.vision_tower.encoder.layers.2.blocks.7.layernorm_after.bias": "model-00001-of-00003.safetensors",
857
+ "encoder.vision_tower.encoder.layers.2.blocks.7.layernorm_after.weight": "model-00001-of-00003.safetensors",
858
+ "encoder.vision_tower.encoder.layers.2.blocks.7.layernorm_before.bias": "model-00001-of-00003.safetensors",
859
+ "encoder.vision_tower.encoder.layers.2.blocks.7.layernorm_before.weight": "model-00001-of-00003.safetensors",
860
+ "encoder.vision_tower.encoder.layers.2.blocks.7.output.dense.bias": "model-00002-of-00003.safetensors",
861
+ "encoder.vision_tower.encoder.layers.2.blocks.7.output.dense.weight": "model-00002-of-00003.safetensors",
862
+ "encoder.vision_tower.encoder.layers.2.blocks.8.attention.output.dense.bias": "model-00002-of-00003.safetensors",
863
+ "encoder.vision_tower.encoder.layers.2.blocks.8.attention.output.dense.weight": "model-00002-of-00003.safetensors",
864
+ "encoder.vision_tower.encoder.layers.2.blocks.8.attention.self.key.bias": "model-00002-of-00003.safetensors",
865
+ "encoder.vision_tower.encoder.layers.2.blocks.8.attention.self.key.weight": "model-00002-of-00003.safetensors",
866
+ "encoder.vision_tower.encoder.layers.2.blocks.8.attention.self.query.bias": "model-00002-of-00003.safetensors",
867
+ "encoder.vision_tower.encoder.layers.2.blocks.8.attention.self.query.weight": "model-00002-of-00003.safetensors",
868
+ "encoder.vision_tower.encoder.layers.2.blocks.8.attention.self.relative_position_bias_table": "model-00002-of-00003.safetensors",
869
+ "encoder.vision_tower.encoder.layers.2.blocks.8.attention.self.relative_position_index": "model-00002-of-00003.safetensors",
870
+ "encoder.vision_tower.encoder.layers.2.blocks.8.attention.self.value.bias": "model-00002-of-00003.safetensors",
871
+ "encoder.vision_tower.encoder.layers.2.blocks.8.attention.self.value.weight": "model-00002-of-00003.safetensors",
872
+ "encoder.vision_tower.encoder.layers.2.blocks.8.intermediate.dense.bias": "model-00002-of-00003.safetensors",
873
+ "encoder.vision_tower.encoder.layers.2.blocks.8.intermediate.dense.weight": "model-00002-of-00003.safetensors",
874
+ "encoder.vision_tower.encoder.layers.2.blocks.8.layernorm_after.bias": "model-00002-of-00003.safetensors",
875
+ "encoder.vision_tower.encoder.layers.2.blocks.8.layernorm_after.weight": "model-00002-of-00003.safetensors",
876
+ "encoder.vision_tower.encoder.layers.2.blocks.8.layernorm_before.bias": "model-00002-of-00003.safetensors",
877
+ "encoder.vision_tower.encoder.layers.2.blocks.8.layernorm_before.weight": "model-00002-of-00003.safetensors",
878
+ "encoder.vision_tower.encoder.layers.2.blocks.8.output.dense.bias": "model-00002-of-00003.safetensors",
879
+ "encoder.vision_tower.encoder.layers.2.blocks.8.output.dense.weight": "model-00002-of-00003.safetensors",
880
+ "encoder.vision_tower.encoder.layers.2.blocks.9.attention.output.dense.bias": "model-00002-of-00003.safetensors",
881
+ "encoder.vision_tower.encoder.layers.2.blocks.9.attention.output.dense.weight": "model-00002-of-00003.safetensors",
882
+ "encoder.vision_tower.encoder.layers.2.blocks.9.attention.self.key.bias": "model-00002-of-00003.safetensors",
883
+ "encoder.vision_tower.encoder.layers.2.blocks.9.attention.self.key.weight": "model-00002-of-00003.safetensors",
884
+ "encoder.vision_tower.encoder.layers.2.blocks.9.attention.self.query.bias": "model-00002-of-00003.safetensors",
885
+ "encoder.vision_tower.encoder.layers.2.blocks.9.attention.self.query.weight": "model-00002-of-00003.safetensors",
886
+ "encoder.vision_tower.encoder.layers.2.blocks.9.attention.self.relative_position_bias_table": "model-00002-of-00003.safetensors",
887
+ "encoder.vision_tower.encoder.layers.2.blocks.9.attention.self.relative_position_index": "model-00002-of-00003.safetensors",
888
+ "encoder.vision_tower.encoder.layers.2.blocks.9.attention.self.value.bias": "model-00002-of-00003.safetensors",
889
+ "encoder.vision_tower.encoder.layers.2.blocks.9.attention.self.value.weight": "model-00002-of-00003.safetensors",
890
+ "encoder.vision_tower.encoder.layers.2.blocks.9.intermediate.dense.bias": "model-00002-of-00003.safetensors",
891
+ "encoder.vision_tower.encoder.layers.2.blocks.9.intermediate.dense.weight": "model-00002-of-00003.safetensors",
892
+ "encoder.vision_tower.encoder.layers.2.blocks.9.layernorm_after.bias": "model-00002-of-00003.safetensors",
893
+ "encoder.vision_tower.encoder.layers.2.blocks.9.layernorm_after.weight": "model-00002-of-00003.safetensors",
894
+ "encoder.vision_tower.encoder.layers.2.blocks.9.layernorm_before.bias": "model-00002-of-00003.safetensors",
895
+ "encoder.vision_tower.encoder.layers.2.blocks.9.layernorm_before.weight": "model-00002-of-00003.safetensors",
896
+ "encoder.vision_tower.encoder.layers.2.blocks.9.output.dense.bias": "model-00002-of-00003.safetensors",
897
+ "encoder.vision_tower.encoder.layers.2.blocks.9.output.dense.weight": "model-00002-of-00003.safetensors",
898
+ "encoder.vision_tower.encoder.layers.2.downsample.norm.bias": "model-00002-of-00003.safetensors",
899
+ "encoder.vision_tower.encoder.layers.2.downsample.norm.weight": "model-00002-of-00003.safetensors",
900
+ "encoder.vision_tower.encoder.layers.2.downsample.reduction.weight": "model-00002-of-00003.safetensors",
901
+ "encoder.vision_tower.encoder.layers.3.blocks.0.attention.output.dense.bias": "model-00002-of-00003.safetensors",
902
+ "encoder.vision_tower.encoder.layers.3.blocks.0.attention.output.dense.weight": "model-00002-of-00003.safetensors",
903
+ "encoder.vision_tower.encoder.layers.3.blocks.0.attention.self.key.bias": "model-00002-of-00003.safetensors",
904
+ "encoder.vision_tower.encoder.layers.3.blocks.0.attention.self.key.weight": "model-00002-of-00003.safetensors",
905
+ "encoder.vision_tower.encoder.layers.3.blocks.0.attention.self.query.bias": "model-00002-of-00003.safetensors",
906
+ "encoder.vision_tower.encoder.layers.3.blocks.0.attention.self.query.weight": "model-00002-of-00003.safetensors",
907
+ "encoder.vision_tower.encoder.layers.3.blocks.0.attention.self.relative_position_bias_table": "model-00002-of-00003.safetensors",
908
+ "encoder.vision_tower.encoder.layers.3.blocks.0.attention.self.relative_position_index": "model-00002-of-00003.safetensors",
909
+ "encoder.vision_tower.encoder.layers.3.blocks.0.attention.self.value.bias": "model-00002-of-00003.safetensors",
910
+ "encoder.vision_tower.encoder.layers.3.blocks.0.attention.self.value.weight": "model-00002-of-00003.safetensors",
911
+ "encoder.vision_tower.encoder.layers.3.blocks.0.intermediate.dense.bias": "model-00002-of-00003.safetensors",
912
+ "encoder.vision_tower.encoder.layers.3.blocks.0.intermediate.dense.weight": "model-00002-of-00003.safetensors",
913
+ "encoder.vision_tower.encoder.layers.3.blocks.0.layernorm_after.bias": "model-00002-of-00003.safetensors",
914
+ "encoder.vision_tower.encoder.layers.3.blocks.0.layernorm_after.weight": "model-00002-of-00003.safetensors",
915
+ "encoder.vision_tower.encoder.layers.3.blocks.0.layernorm_before.bias": "model-00002-of-00003.safetensors",
916
+ "encoder.vision_tower.encoder.layers.3.blocks.0.layernorm_before.weight": "model-00002-of-00003.safetensors",
917
+ "encoder.vision_tower.encoder.layers.3.blocks.0.output.dense.bias": "model-00002-of-00003.safetensors",
918
+ "encoder.vision_tower.encoder.layers.3.blocks.0.output.dense.weight": "model-00002-of-00003.safetensors",
919
+ "encoder.vision_tower.encoder.layers.3.blocks.1.attention.output.dense.bias": "model-00002-of-00003.safetensors",
920
+ "encoder.vision_tower.encoder.layers.3.blocks.1.attention.output.dense.weight": "model-00002-of-00003.safetensors",
921
+ "encoder.vision_tower.encoder.layers.3.blocks.1.attention.self.key.bias": "model-00002-of-00003.safetensors",
922
+ "encoder.vision_tower.encoder.layers.3.blocks.1.attention.self.key.weight": "model-00002-of-00003.safetensors",
923
+ "encoder.vision_tower.encoder.layers.3.blocks.1.attention.self.query.bias": "model-00002-of-00003.safetensors",
924
+ "encoder.vision_tower.encoder.layers.3.blocks.1.attention.self.query.weight": "model-00002-of-00003.safetensors",
925
+ "encoder.vision_tower.encoder.layers.3.blocks.1.attention.self.relative_position_bias_table": "model-00002-of-00003.safetensors",
926
+ "encoder.vision_tower.encoder.layers.3.blocks.1.attention.self.relative_position_index": "model-00002-of-00003.safetensors",
927
+ "encoder.vision_tower.encoder.layers.3.blocks.1.attention.self.value.bias": "model-00002-of-00003.safetensors",
928
+ "encoder.vision_tower.encoder.layers.3.blocks.1.attention.self.value.weight": "model-00002-of-00003.safetensors",
929
+ "encoder.vision_tower.encoder.layers.3.blocks.1.intermediate.dense.bias": "model-00002-of-00003.safetensors",
930
+ "encoder.vision_tower.encoder.layers.3.blocks.1.intermediate.dense.weight": "model-00002-of-00003.safetensors",
931
+ "encoder.vision_tower.encoder.layers.3.blocks.1.layernorm_after.bias": "model-00002-of-00003.safetensors",
932
+ "encoder.vision_tower.encoder.layers.3.blocks.1.layernorm_after.weight": "model-00002-of-00003.safetensors",
933
+ "encoder.vision_tower.encoder.layers.3.blocks.1.layernorm_before.bias": "model-00002-of-00003.safetensors",
934
+ "encoder.vision_tower.encoder.layers.3.blocks.1.layernorm_before.weight": "model-00002-of-00003.safetensors",
935
+ "encoder.vision_tower.encoder.layers.3.blocks.1.output.dense.bias": "model-00002-of-00003.safetensors",
936
+ "encoder.vision_tower.encoder.layers.3.blocks.1.output.dense.weight": "model-00002-of-00003.safetensors",
937
  "lm_head.weight": "model-00003-of-00003.safetensors",
938
  "shared.weight": "model-00001-of-00003.safetensors"
939
  }
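The rebuilt weight map above routes the mm_projector and vision-tower tensors through the first two shards and shifts several late decoder blocks into the third, which is why the shard sizes and the declared total_size changed. A small sketch that summarizes an index file of this form (it only assumes model.safetensors.index.json is available locally) is:

# Minimal sketch: summarize model.safetensors.index.json, then look up a few keys
# that appear in the diff above.
import json
from collections import Counter

with open("model.safetensors.index.json") as f:
    index = json.load(f)

weight_map = index["weight_map"]
print("declared total_size:", index["metadata"]["total_size"])

# Count how many tensors each shard holds.
per_shard = Counter(weight_map.values())
for shard, count in sorted(per_shard.items()):
    print(f"{shard}: {count} tensors")

for key in ("encoder.mm_projector.weight",
            "encoder.vision_tower.embeddings.norm.weight",
            "decoder.block.19.layer.0.SelfAttention.q.weight"):
    print(key, "->", weight_map.get(key, "not found"))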