VedantPadwal committed on
Commit
6b8a59c
1 Parent(s): 5fdf30c

Upload 17 files

Browse files
attn_mask.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ from dataclasses import dataclass
15
+ from typing import List, Optional, Tuple, Union
16
+ from utils import FloatTensor
17
+
18
+ import mlx.core as mx
19
+
20
+ # Custom function to mimic torch.finfo
21
def get_finfo_min(dtype: "mx.Dtype"):
    """Return the most negative finite value of a floating-point dtype.

    Stand-in for ``torch.finfo(dtype).min``.

    Args:
        dtype: An MLX dtype, or anything whose ``str()`` is a dtype name.
            Both bare names (``"float32"``) and module-qualified names
            (``"mlx.core.float32"``) are accepted.

    Returns:
        The minimum finite value as a Python ``float``.

    Raises:
        ValueError: If the dtype is not one of the supported float types.
    """
    finfo_min = {
        "float16": -65504.0,
        "bfloat16": -3.3895313892515355e+38,
        "float32": -3.4028235e+38,
        "float64": -1.7976931348623157e+308,
    }

    dtype_str = str(dtype)
    # MLX renders dtypes with a module prefix (e.g. ``mlx.core.float32``), so
    # only the last dotted component is compared; bare names still match.
    dtype_name = dtype_str.rsplit(".", 1)[-1]

    try:
        return finfo_min[dtype_name]
    except KeyError:
        raise ValueError(f"Unsupported data type: {dtype_str}") from None
31
+
32
@dataclass
class AttentionMaskConverter:
    """Build additive 4D attention masks as MLX arrays.

    The masks hold ``0`` where attention is allowed and the most negative
    finite value of the requested dtype (``get_finfo_min``) where it is
    blocked, so they can be added to attention scores before the softmax.

    This is an MLX port of the PyTorch converter of the same name: all
    tensor operations use ``mlx.core`` functions. MLX arrays are not bound
    to a device, so the ``device`` arguments are accepted only for
    call-site compatibility and are ignored.
    """

    # Whether a query may only attend to keys at or before its own position.
    is_causal: bool
    # Optional sliding-window size; must be strictly positive when given.
    sliding_window: Optional[int]

    def __init__(self, is_causal: bool, sliding_window: Optional[int] = None):
        """Store the settings and validate ``sliding_window``.

        Raises:
            ValueError: If ``sliding_window`` is given and is not > 0.
        """
        self.is_causal = is_causal
        self.sliding_window = sliding_window

        if self.sliding_window is not None and self.sliding_window <= 0:
            raise ValueError(
                f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`"
            )

    def to_causal_4d(
        self,
        batch_size: int,
        query_length: int,
        key_value_length: int,
        dtype: "mx.Dtype",
        device: Union["mx.Device", "str"] = "cpu",
    ) -> Optional["mx.array"]:
        """
        Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
        bias to upper right hand triangular matrix (causal mask).

        Returns ``None`` when there is a single query token and no sliding window, because no position needs
        masking in that case. ``device`` is ignored (kept for interface compatibility).

        Raises:
            ValueError: If the converter was created with ``is_causal=False``.
        """
        if not self.is_causal:
            raise ValueError(f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True.")

        input_shape = (batch_size, query_length)
        past_key_values_length = key_value_length - query_length

        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        causal_4d_mask = None
        if input_shape[-1] > 1 or self.sliding_window is not None:
            causal_4d_mask = self._make_causal_mask(
                input_shape,
                dtype,
                device=device,
                past_key_values_length=past_key_values_length,
                sliding_window=self.sliding_window,
            )

        return causal_4d_mask

    def to_4d(
        self,
        attention_mask_2d: "mx.array",
        query_length: int,
        dtype: "mx.Dtype",
        key_value_length: Optional[int] = None,
    ) -> "mx.array":
        """
        Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
        key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is
        causal, a causal mask will be added.

        Raises:
            ValueError: If the converter is causal and ``key_value_length`` is not provided.
            NotImplementedError: If a sliding window is set on a non-causal converter.
        """
        input_shape = (attention_mask_2d.shape[0], query_length)

        # create causal mask
        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        causal_4d_mask = None
        if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal:
            if key_value_length is None:
                raise ValueError(
                    "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask."
                )

            past_key_values_length = key_value_length - query_length
            causal_4d_mask = self._make_causal_mask(
                input_shape,
                dtype,
                past_key_values_length=past_key_values_length,
                sliding_window=self.sliding_window,
            )
        elif self.sliding_window is not None:
            raise NotImplementedError("Sliding window is currently only implemented for causal masking")

        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1])

        if causal_4d_mask is not None:
            # Select rather than add: summing two large negative biases could overflow the dtype.
            expanded_attn_mask = mx.where(
                expanded_attn_mask.astype(mx.bool_), get_finfo_min(dtype), causal_4d_mask
            ).astype(dtype)

        return expanded_attn_mask

    @staticmethod
    def _make_causal_mask(
        input_ids_shape: Tuple[int, int],
        dtype: "mx.Dtype",
        device: Optional["mx.Device"] = None,
        past_key_values_length: int = 0,
        sliding_window: Optional[int] = None,
    ):
        """
        Make the causal mask of shape (bsz, 1, tgt_len, tgt_len + past_key_values_length).

        A query at row ``i`` may attend to every cached (past) key and to new keys at columns ``<= i``;
        later columns receive the large negative bias. With ``sliding_window`` set, keys that lie too far
        in the past are blocked as well. ``device`` is ignored (kept for interface compatibility).
        """
        bsz, tgt_len = input_ids_shape
        min_value = get_finfo_min(dtype)

        # Block the strict upper triangle (column > row), i.e. future positions.
        positions = mx.arange(tgt_len)
        is_future = positions[None, :] > positions[:, None]
        mask = mx.where(is_future, min_value, 0.0).astype(dtype)

        if past_key_values_length > 0:
            # Every cached key precedes every new query, so it is always visible.
            past_mask = mx.zeros((tgt_len, past_key_values_length), dtype=dtype)
            mask = mx.concatenate([past_mask, mask], axis=-1)

        # add lower triangular sliding window mask if necessary
        if sliding_window is not None:
            diagonal = past_key_values_length - sliding_window - 1
            rows = mx.arange(mask.shape[0])[:, None]
            cols = mx.arange(mask.shape[1])[None, :]
            # Same selection as a lower-triangular mask with offset `diagonal`.
            outside_window = (cols - rows) <= diagonal
            mask = mx.where(outside_window, min_value, mask).astype(dtype)

        return mx.broadcast_to(
            mask[None, None, :, :], (bsz, 1, tgt_len, tgt_len + past_key_values_length)
        )

    @staticmethod
    def _expand_mask(mask: "mx.array", dtype: "mx.Dtype", tgt_len: Optional[int] = None):
        """
        Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.

        Input entries equal to 1 become 0 (attend); every other entry becomes the large negative bias.
        """
        bsz, src_len = mask.shape
        tgt_len = tgt_len if tgt_len is not None else src_len

        expanded_mask = mx.broadcast_to(mask[:, None, None, :], (bsz, 1, tgt_len, src_len)).astype(dtype)

        inverted_mask = 1.0 - expanded_mask

        return mx.where(inverted_mask.astype(mx.bool_), get_finfo_min(dtype), inverted_mask).astype(dtype)

    @staticmethod
    def _unmask_unattended(
        expanded_mask: "FloatTensor",
        min_dtype: float,
    ):
        # fmt: off
        """
        Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
        using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
        Details: https://github.com/pytorch/pytorch/issues/110213

        `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
        `attention_mask` is [bsz, src_seq_len].

        The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias.

        For example, if `expanded_mask` is (e.g. here left-padding case)
        ```
        [[[[0, 0, 0],
           [0, 0, 0],
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[0, 0, 0],
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        then the modified `expanded_mask` will be
        ```
        [[[[1, 1, 1],   <-- modified
           [1, 1, 1],   <-- modified
           [0, 0, 1]]],
         [[[1, 0, 0],
           [1, 1, 0],
           [1, 1, 1]]],
         [[[1, 1, 1],   <-- modified
           [0, 1, 0],
           [0, 1, 1]]]]
        ```
        """
        # fmt: on
        if expanded_mask.dtype == mx.bool_:
            raise ValueError(
                "AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor."
            )

        # Rows where every key is blocked are multiplied by 0, which turns them into all-zero (attend-all) rows.
        fully_masked = mx.all(expanded_mask == min_dtype, axis=-1, keepdims=True)
        keep = mx.logical_not(fully_masked).astype(expanded_mask.dtype)
        return expanded_mask * keep
219
+
220
def _prepare_4d_causal_attention_mask(
    attention_mask: Optional["mx.array"],
    input_shape: Union["mx.array", Tuple, List],
    inputs_embeds: "mx.array",
    past_key_values_length: int,
    sliding_window: Optional[int] = None,
):
    """
    Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
    `(batch_size, key_value_length)`

    Args:
        attention_mask (`mx.array` or `None`):
            A 2D attention mask of shape `(batch_size, key_value_length)`, or an already expanded 4D mask of
            shape `(batch_size, 1, query_length, key_value_length)`.
        input_shape (`tuple(int)` or `list(int)`):
            The input shape should be a tuple that defines `(batch_size, query_length)`.
        inputs_embeds (`mx.array`):
            The embedded inputs as an MLX array; only its `dtype` is used.
        past_key_values_length (`int`):
            The length of the key value cache.
        sliding_window (`int`, *optional*):
            If the model uses windowed attention, a sliding window should be passed.

    Returns:
        The 4D additive mask, or `None` when no mask was given and a purely causal mask is unnecessary
        (as decided by `AttentionMaskConverter.to_causal_4d`).

    Raises:
        ValueError: If a 4D `attention_mask` does not have the expected shape.
    """
    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)

    key_value_length = input_shape[-1] + past_key_values_length
    dtype = inputs_embeds.dtype

    # 4d mask is passed through the layers
    if attention_mask is not None and len(attention_mask.shape) == 2:
        attention_mask = attn_mask_converter.to_4d(
            attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=dtype
        )
    elif attention_mask is not None and len(attention_mask.shape) == 4:
        expected_shape = (input_shape[0], 1, input_shape[1], key_value_length)
        if tuple(attention_mask.shape) != expected_shape:
            raise ValueError(
                f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}."
            )
        # if the 4D mask has correct shape - invert it and fill with negative infinity
        inverted_mask = 1.0 - attention_mask
        attention_mask = mx.where(
            inverted_mask.astype(mx.bool_), get_finfo_min(dtype), inverted_mask
        ).astype(dtype)
    else:
        # MLX arrays carry no device attribute, so no device is forwarded.
        attention_mask = attn_mask_converter.to_causal_4d(
            input_shape[0], input_shape[-1], key_value_length, dtype=dtype
        )

    return attention_mask
config.json ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Phi-3-vision-128k-instruct",
3
+ "architectures": [
4
+ "Phi3VForCausalLM"
5
+ ],
6
+ "attention_dropout": 0.0,
7
+ "auto_map": {
8
+ "AutoConfig": "configuration_phi3_v.Phi3VConfig",
9
+ "AutoModelForCausalLM": "modeling_phi3_v.Phi3VForCausalLM"
10
+ },
11
+ "bos_token_id": 1,
12
+ "embd_layer": {
13
+ "embedding_cls": "image",
14
+ "hd_transform_order": "sub_glb",
15
+ "projection_cls": "mlp",
16
+ "use_hd_transform": true,
17
+ "with_learnable_separator": true
18
+ },
19
+ "eos_token_id": 2,
20
+ "hidden_act": "silu",
21
+ "hidden_size": 3072,
22
+ "img_processor": {
23
+ "image_dim_out": 1024,
24
+ "model_name": "openai/clip-vit-large-patch14-336",
25
+ "name": "clip_vision_model",
26
+ "num_img_tokens": 144
27
+ },
28
+ "initializer_range": 0.02,
29
+ "intermediate_size": 8192,
30
+ "max_position_embeddings": 131072,
31
+ "model_type": "phi3_v",
32
+ "num_attention_heads": 32,
33
+ "num_hidden_layers": 32,
34
+ "num_key_value_heads": 32,
35
+ "original_max_position_embeddings": 4096,
36
+ "rms_norm_eps": 1e-05,
37
+ "rope_scaling": {
38
+ "long_factor": [
39
+ 1.0299999713897705,
40
+ 1.0499999523162842,
41
+ 1.0499999523162842,
42
+ 1.0799999237060547,
43
+ 1.2299998998641968,
44
+ 1.2299998998641968,
45
+ 1.2999999523162842,
46
+ 1.4499999284744263,
47
+ 1.5999999046325684,
48
+ 1.6499998569488525,
49
+ 1.8999998569488525,
50
+ 2.859999895095825,
51
+ 3.68999981880188,
52
+ 5.419999599456787,
53
+ 5.489999771118164,
54
+ 5.489999771118164,
55
+ 9.09000015258789,
56
+ 11.579999923706055,
57
+ 15.65999984741211,
58
+ 15.769999504089355,
59
+ 15.789999961853027,
60
+ 18.360000610351562,
61
+ 21.989999771118164,
62
+ 23.079999923706055,
63
+ 30.009998321533203,
64
+ 32.35000228881836,
65
+ 32.590003967285156,
66
+ 35.56000518798828,
67
+ 39.95000457763672,
68
+ 53.840003967285156,
69
+ 56.20000457763672,
70
+ 57.95000457763672,
71
+ 59.29000473022461,
72
+ 59.77000427246094,
73
+ 59.920005798339844,
74
+ 61.190006256103516,
75
+ 61.96000671386719,
76
+ 62.50000762939453,
77
+ 63.3700065612793,
78
+ 63.48000717163086,
79
+ 63.48000717163086,
80
+ 63.66000747680664,
81
+ 63.850006103515625,
82
+ 64.08000946044922,
83
+ 64.760009765625,
84
+ 64.80001068115234,
85
+ 64.81001281738281,
86
+ 64.81001281738281
87
+ ],
88
+ "short_factor": [
89
+ 1.05,
90
+ 1.05,
91
+ 1.05,
92
+ 1.1,
93
+ 1.1,
94
+ 1.1,
95
+ 1.2500000000000002,
96
+ 1.2500000000000002,
97
+ 1.4000000000000004,
98
+ 1.4500000000000004,
99
+ 1.5500000000000005,
100
+ 1.8500000000000008,
101
+ 1.9000000000000008,
102
+ 2.000000000000001,
103
+ 2.000000000000001,
104
+ 2.000000000000001,
105
+ 2.000000000000001,
106
+ 2.000000000000001,
107
+ 2.000000000000001,
108
+ 2.000000000000001,
109
+ 2.000000000000001,
110
+ 2.000000000000001,
111
+ 2.000000000000001,
112
+ 2.000000000000001,
113
+ 2.000000000000001,
114
+ 2.000000000000001,
115
+ 2.000000000000001,
116
+ 2.000000000000001,
117
+ 2.000000000000001,
118
+ 2.000000000000001,
119
+ 2.000000000000001,
120
+ 2.000000000000001,
121
+ 2.1000000000000005,
122
+ 2.1000000000000005,
123
+ 2.2,
124
+ 2.3499999999999996,
125
+ 2.3499999999999996,
126
+ 2.3499999999999996,
127
+ 2.3499999999999996,
128
+ 2.3999999999999995,
129
+ 2.3999999999999995,
130
+ 2.6499999999999986,
131
+ 2.6999999999999984,
132
+ 2.8999999999999977,
133
+ 2.9499999999999975,
134
+ 3.049999999999997,
135
+ 3.049999999999997,
136
+ 3.049999999999997
137
+ ],
138
+ "type": "su"
139
+ },
140
+ "rope_theta": 10000.0,
141
+ "sliding_window": 131072,
142
+ "tie_word_embeddings": false,
143
+ "torch_dtype": "bfloat16",
144
+ "transformers_version": "4.38.1",
145
+ "use_cache": true,
146
+ "vocab_size": 32064,
147
+ "_attn_implementation": "eager"
148
+ }
configuration_phi3_v.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ import inspect
3
+ import logging
4
+ from typing import Optional, List, Union, Dict, Tuple, Any
5
+ from transformers.configuration_utils import PretrainedConfig
6
+ import mlx.core as mx
7
+
8
+ PHI3V_PRETRAINED_CONFIG_ARCHIVE_MAP = {
9
+ "microsoft/Phi-3-vision-128k-instruct": "https://huggingface.co/microsoft/Phi-3-vision-128k-instruct/resolve/main/config.json",
10
+ }
11
+
12
class Phi3VConfig(PretrainedConfig):
    """Configuration for the Phi-3-vision (`phi3_v`) model.

    Stores the language-model hyper-parameters as attributes, validates
    `rope_scaling`, and forwards the token ids, `tie_word_embeddings` and
    any extra keyword arguments to `PretrainedConfig`.
    """

    model_type = "phi3_v"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=32064,
        hidden_size=3072,
        intermediate_size=8192,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        resid_pdrop=0.0,
        embd_pdrop=0.0,
        attention_dropout=0.0,
        hidden_act="silu",
        max_position_embeddings=4096,
        original_max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-5,
        use_cache=True,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        bos_token_id=1,
        eos_token_id=32000,
        pad_token_id=32000,
        sliding_window=None,
        embd_layer: str = "default",
        **kwargs,
    ):
        # Model dimensions.
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        # Without an explicit value, use one key/value head per attention head.
        self.num_key_value_heads = (
            num_attention_heads if num_key_value_heads is None else num_key_value_heads
        )

        # Regularisation, activation and initialisation.
        self.resid_pdrop = resid_pdrop
        self.embd_pdrop = embd_pdrop
        self.attention_dropout = attention_dropout
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.original_max_position_embeddings = original_max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache

        # Rotary embeddings; validation reads hidden_size / num_attention_heads set above.
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self._rope_scaling_validation()

        self.sliding_window = sliding_window
        self.embd_layer = embd_layer

        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    def _rope_scaling_validation(self):
        """
        Validate the `rope_scaling` configuration.

        Passes silently when `rope_scaling` is `None`. Otherwise it must be a three-entry dict whose `type` is
        "su" or "yarn" and whose `short_factor` and `long_factor` are lists of numbers of length
        `hidden_size // num_attention_heads // 2`. Raises `ValueError` on the first violated rule.
        """
        scaling = self.rope_scaling
        if scaling is None:
            return

        if not (isinstance(scaling, dict) and len(scaling) == 3):
            raise ValueError(
                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
                f"got {self.rope_scaling}"
            )

        def is_number_list(candidate):
            return isinstance(candidate, list) and all(isinstance(x, (int, float)) for x in candidate)

        scaling_type = scaling.get("type", None)
        short_factor = scaling.get("short_factor", None)
        long_factor = scaling.get("long_factor", None)

        if scaling_type is None or scaling_type not in ["su", "yarn"]:
            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {scaling_type}")

        if not is_number_list(short_factor):
            raise ValueError(
                f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}"
            )
        if len(short_factor) != self.hidden_size // self.num_attention_heads // 2:
            raise ValueError(
                f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(short_factor)}"
            )

        if not is_number_list(long_factor):
            raise ValueError(
                f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}"
            )
        if len(long_factor) != self.hidden_size // self.num_attention_heads // 2:
            raise ValueError(
                f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(long_factor)}"
            )
gen.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ import requests
3
+ from transformers import AutoProcessor
4
+ from modeling_phi3_v import Phi3VModel
5
model_path = "./"

# kwargs = {}
# kwargs['torch_dtype'] = torch.bfloat16

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = Phi3VModel.from_pretrained(model_path)

user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"


def load_image(url):
    """Download the image at `url` and open it with PIL."""
    return Image.open(requests.get(url, stream=True).raw)


def generate_and_print(inputs, skip_special_tokens=True):
    """Generate a completion for `inputs`, print it and return it.

    Generates up to 1000 new tokens, drops the prompt tokens from the
    output, and decodes the first sequence of the batch.
    """
    generate_ids = model.generate(**inputs,
                                  max_new_tokens=1000,
                                  eos_token_id=processor.tokenizer.eos_token_id,
                                  )
    # The generated ids start with the prompt; keep only the newly generated tokens.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids,
                                      skip_special_tokens=skip_special_tokens,
                                      clean_up_tokenization_spaces=False)[0]
    print(f'>>> Response\n{response}')
    return response


# NOTE(review): the first example leaves the inputs where the processor put
# them, while every later example moves them to "cuda:0". This is preserved
# from the original script; confirm which one the model actually expects.

#################################################### text-only ####################################################
# text-only prompt (no image)
prompt = f"{user_prompt}what is the answer for 1+1? Explain it.{prompt_suffix}{assistant_prompt}"
print(f">>> Prompt\n{prompt}")
inputs = processor(prompt, images=None, return_tensors="pt")
generate_and_print(inputs)

#################################################### text-only 2 ####################################################
# text-only prompt (no image)
prompt = f"{user_prompt}Give me the code for sloving two-sum problem.{prompt_suffix}{assistant_prompt}"
print(f">>> Prompt\n{prompt}")
inputs = processor(prompt, images=None, return_tensors="pt").to("cuda:0")
generate_and_print(inputs)


#################################################### EXAMPLE 1 ####################################################
# single-image prompt
prompt = f"{user_prompt}<|image_1|>\nWhat is shown in this image?{prompt_suffix}{assistant_prompt}"
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
print(f">>> Prompt\n{prompt}")
image = load_image(url)
inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
generate_and_print(inputs)

#################################################### EXAMPLE 2 ####################################################
# multiple image prompt
# Note: image tokens must start from <|image_1|>
prompt = f"{user_prompt}<|image_1|>\n<|image_2|>\n What is shown in this two images?{prompt_suffix}{assistant_prompt}"
print(f">>> Prompt\n{prompt}")
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image_1 = load_image(url)
url = "https://img.freepik.com/free-photo/painting-mountain-lake-with-mountain-background_188544-9126.jpg?w=2000"
image_2 = load_image(url)
images = [image_1, image_2]
inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")
generate_and_print(inputs)

#################################################### EXAMPLE 3 ####################################################
# chat template
chat = [
    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
    {"role": "assistant", "content": "The image depicts a street scene with a prominent red stop sign in the foreground. The background showcases a building with traditional Chinese architecture, characterized by its red roof and ornate decorations. There are also several statues of lions, which are common in Chinese culture, positioned in front of the building. The street is lined with various shops and businesses, and there's a car passing by."},
    {"role": "user", "content": "What is so special about this image"}
]
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = load_image(url)
prompt = processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
# need to remove last <|endoftext|> if it is there, which is used for training, not inference. For training, make sure to add <|endoftext|> in the end.
end_of_text = "<|endoftext|>"
if prompt.endswith(end_of_text):
    # Slice the suffix off. str.rstrip() would treat the argument as a set of
    # characters and could also eat a preceding "<|end|>" marker.
    prompt = prompt[:-len(end_of_text)]

print(f">>> Prompt\n{prompt}")

inputs = processor(prompt, [image], return_tensors="pt").to("cuda:0")
generate_and_print(inputs)


############################# to markdown #############################
# single-image prompt
prompt = f"{user_prompt}<|image_1|>\nCan you convert the table to markdown format?{prompt_suffix}{assistant_prompt}"
url = "https://support.content.office.net/en-us/media/3dd2b79b-9160-403d-9967-af893d17b580.png"
image = load_image(url)
inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")

print(f">>> Prompt\n{prompt}")
# Special tokens are kept in this example's decoded output.
generate_and_print(inputs, skip_special_tokens=False)
image_embedding_phi3_v.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import math
17
+ import torch
18
+ import torch.nn as nn
19
+ from transformers import CLIPVisionModel, PretrainedConfig
20
+ from transformers import CLIPVisionConfig
21
+ from transformers.utils import logging
22
+ from datetime import datetime
23
+
24
logger = logging.get_logger(__name__)

# Vision-tower hyper-parameters, hard-coded so the CLIP model can be built
# from this constant instead of a separately loaded configuration.
CLIP_VIT_LARGE_PATCH14_336_CONFIG = CLIPVisionConfig(
    # Input geometry.
    image_size=336,
    patch_size=14,
    num_channels=3,
    # Transformer size.
    hidden_size=1024,
    intermediate_size=4096,
    num_hidden_layers=24,
    num_attention_heads=16,
    projection_dim=768,
    # Activation, normalisation and regularisation.
    hidden_act="quick_gelu",
    layer_norm_eps=1e-05,
    attention_dropout=0.0,
    dropout=0.0,
    # Weight initialisation.
    initializer_factor=1.0,
    initializer_range=0.02,
)
42
+
43
+ class Phi3ImageEmbedding(nn.Module):
44
+ """Phi3 Image embedding."""
45
+
46
+ def __init__(self, config: PretrainedConfig, wte=None, **kwargs) -> None:
47
+ super().__init__()
48
+
49
+ # n_embed or hidden_size
50
+ hidden_size = config.n_embd if hasattr(config, 'n_embd') else config.hidden_size
51
+ if hasattr(config, 'embd_pdrop') or hasattr(config, 'embed_pdrop'):
52
+ embd_drop = config.embd_pdrop if hasattr(config, 'embd_pdrop') else config.embed_pdrop
53
+ self.drop = nn.Dropout(embd_drop)
54
+ else:
55
+ self.drop = None
56
+
57
+ self.wte = wte
58
+
59
+ if isinstance(config.img_processor, dict) and config.img_processor.get('name', None) == 'clip_vision_model':
60
+ assert 'model_name' in config.img_processor, 'model_name must be provided for CLIPVisionModel'
61
+ assert 'image_dim_out' in config.img_processor, 'image_dim_out must be provided for CLIPVisionModel'
62
+ assert 'num_img_tokens' in config.img_processor, 'num_img_tokens must be provided for CLIPVisionModel'
63
+ assert config.img_processor['model_name'] == 'openai/clip-vit-large-patch14-336'
64
+ clip_config = CLIP_VIT_LARGE_PATCH14_336_CONFIG
65
+ self.img_processor = CLIPVisionModel(clip_config)
66
+ image_dim_out = config.img_processor['image_dim_out']
67
+ self.num_img_tokens = config.img_processor['num_img_tokens']
68
+ else:
69
+ raise NotImplementedError(f'img_processor = {config.img_processor}, not implemented')
70
+
71
+ self.image_dim_out = image_dim_out
72
+ self.img_sizes = None
73
+
74
+ # global_gn and sub_gn for hd transform, serves as line separator
75
+ self.use_hd_transform = kwargs.get('use_hd_transform', False)
76
+ self.with_learnable_separator = kwargs.get('with_learnable_separator', False)
77
+ self.hd_transform_order = kwargs.get('hd_transform_order', 'glb_sub')
78
+ # with_hd_transform and with_learnable_separator should have same value
79
+ assert self.use_hd_transform == self.with_learnable_separator, 'use_hd_transform and with_learnable_separator should have same value'
80
+ if self.with_learnable_separator:
81
+ assert self.use_hd_transform, 'learnable separator is only for hd transform'
82
+ # 1024 * 4, merge spatial to channel dimension
83
+ self.glb_GN = nn.Parameter(torch.zeros([1, 1, self.image_dim_out * 4]))
84
+ self.sub_GN = nn.Parameter(torch.zeros([1, 1, 1, self.image_dim_out * 4]))
85
+ logger.info(f'learnable separator enabled for hd transform, hd_transform_order = {self.hd_transform_order}')
86
+
87
+ projection_cls = kwargs.get('projection_cls', 'linear')
88
+ if projection_cls == 'linear':
89
+ self.img_projection = nn.Linear(image_dim_out, hidden_size)
90
+ elif projection_cls == 'mlp' and self.use_hd_transform:
91
+ dim_projection = hidden_size
92
+ depth = 2
93
+ layers = [nn.Linear(image_dim_out * 4, dim_projection)]
94
+ for _ in range(1, depth):
95
+ layers.extend([nn.GELU(),
96
+ nn.Linear(dim_projection, dim_projection)])
97
+ self.img_projection = nn.Sequential(*layers)
98
+ elif projection_cls == 'mlp':
99
+ dim_projection = hidden_size
100
+ depth = 2
101
+ layers = [nn.Linear(image_dim_out, dim_projection)]
102
+ for _ in range(1, depth):
103
+ layers.extend([nn.GELU(),
104
+ nn.Linear(dim_projection, dim_projection)])
105
+ self.img_projection = nn.Sequential(*layers)
106
+ else:
107
+ raise NotImplementedError(f'projection_cls = {projection_cls}, not implemented')
108
+
109
+ self.vocab_size = config.vocab_size
110
+ self.img_features = None
111
+
112
+ if isinstance(config.img_processor, dict):
113
+ self.layer_idx = config.img_processor.get('layer_idx', -2)
114
+ self.type_feature = config.img_processor.get('type_feature', 'patch')
115
+ else:
116
+ self.layer_idx = -2
117
+ self.type_feature = 'patch'
118
+
119
+
120
def set_img_features(self, img_features: torch.FloatTensor) -> None:
    """Stash image features to be used by the next ``forward`` call.

    When set, ``forward`` uses a clone of this tensor in place of its
    ``pixel_values`` argument and then resets the stored value to ``None``.

    Args:
        img_features: Tensor to store on the module.
    """
    self.img_features = img_features
122
+
123
def set_img_sizes(self, img_sizes: torch.LongTensor) -> None:
    """Store image sizes that override the ``image_sizes`` argument of ``forward``.

    Unlike the stashed image features, ``forward`` does not clear this value
    after reading it.

    Args:
        img_sizes: Sizes to store on the module.
    """
    self.img_sizes = img_sizes
125
+
126
def get_img_features(self, img_embeds: torch.FloatTensor) -> torch.FloatTensor:
    """Run the image processor and return one of its hidden states.

    Args:
        img_embeds: Input forwarded to ``self.img_processor``.

    Returns:
        The hidden state at index ``self.layer_idx``. With
        ``self.type_feature == "patch"`` the first entry along dim 1 is
        dropped (presumably the CLS token -- confirm against the vision
        backbone); with ``"cls_patch"`` the hidden state is returned whole.

    Raises:
        NotImplementedError: If ``self.type_feature`` is neither ``"patch"``
            nor ``"cls_patch"``.
    """
    img_processor_output = self.img_processor(img_embeds, output_hidden_states=True)
    img_feature = img_processor_output.hidden_states[self.layer_idx]

    if self.type_feature == "patch":
        return img_feature[:, 1:]

    if self.type_feature == "cls_patch":
        return img_feature

    # Name the offending value, matching the other NotImplementedError raises in this file.
    raise NotImplementedError(f'type_feature = {self.type_feature}, not implemented')
141
+
142
def forward(self, input_ids: torch.LongTensor, pixel_values: torch.FloatTensor, image_sizes=None) -> torch.FloatTensor:
    """Embed ``input_ids`` and splice projected image features into placeholder positions.

    Entries of ``input_ids`` strictly between ``-MAX_INPUT_ID`` and 0 mark image
    placeholder positions. They are clamped to 0 for the ``self.wte`` lookup and
    the resulting rows are then overwritten with ``self.img_projection`` output.

    Three image layouts are handled:

    * 5-D input with ``self.use_hd_transform`` set and non-empty image sizes:
      HD-transform path, variable number of tokens per image.
    * 4-D input: run through ``get_img_features`` and projected, using
      ``self.num_img_tokens`` tokens per image.
    * 3-D input: projected directly, using ``self.num_img_tokens`` tokens per image.

    Args:
        input_ids: Token ids, viewed as ``(-1, last_dim)``. NOTE: the view is
            clamped in place, so the caller's tensor is modified.
        pixel_values: Image input; replaced by the features stored through
            ``set_img_features`` when those are present.
        image_sizes: Per-image ``(height, width)`` pairs; replaced by the value
            stored through ``set_img_sizes`` when present. The HD path divides
            them by 336 (presumably pixel sizes that are multiples of 336 --
            confirm against the image processor).

    Returns:
        The ``self.wte`` embeddings with image rows replaced, passed through
        ``self.drop`` when it is not ``None``.

    Raises:
        NotImplementedError: For an unsupported image tensor rank or an unknown
            ``self.hd_transform_order``.
        AssertionError: When shape or count sanity checks fail.
    """
    MAX_INPUT_ID = int(1e9)
    img_embeds = pixel_values
    img_sizes = image_sizes

    # Values stored through the setters take precedence over the call arguments.
    if self.img_features is not None:
        img_embeds = self.img_features.clone()
        self.img_features = None  # consumed by this call

    if self.img_sizes is not None:
        img_sizes = self.img_sizes

    input_shape = input_ids.size()
    input_ids = input_ids.view(-1, input_shape[-1])

    with torch.no_grad():
        # (row, col) index pairs of every image placeholder token.
        positions = torch.nonzero((input_ids < 0) & (input_ids > -MAX_INPUT_ID), as_tuple=False)

    select = False
    # Must be initialised before the branches below: it is read whenever
    # `select` is True, including on the non-HD (3-D / 4-D) paths.
    hd_transform = False

    if isinstance(self.img_projection, nn.Sequential):
        target_device = self.img_projection[0].bias.device
        target_dtype = self.img_projection[0].bias.dtype
    else:  # It's a single nn.Linear layer
        target_device = self.img_projection.bias.device
        target_dtype = self.img_projection.bias.dtype

    if positions.shape[0] > 0:
        with torch.no_grad():
            g_values = abs(input_ids[positions[:, 0], positions[:, 1]])

        if self.use_hd_transform and img_sizes is not None and len(img_sizes):
            hd_transform = True
            assert img_embeds.ndim == 5, f'img_embeds size: {img_embeds.size()}, expect 5D tensor for hd transform'
            # img_embeds: (num_images, max_num_crops, 3, H, W)
            # img_sizes: (num_images, 2).view(1, -1)

            start_time = datetime.now()
            bs = img_embeds.shape[0]
            # Nx(HW)xC
            img_features = self.get_img_features(img_embeds.flatten(0, 1))
            base_feat_height = base_feat_width = int(img_features.shape[1] ** 0.5)

            assert base_feat_height == 24 and base_feat_width == 24, f'base_feat_height: {base_feat_height}, base_feat_width: {base_feat_width}, expect 24x24 features for hd transform'

            # bs x max_num_crops x (24x24) x C
            img_features = img_features.view(bs, -1, base_feat_height * base_feat_width, self.image_dim_out)
            C = self.image_dim_out
            H = base_feat_height

            output_imgs = []
            output_len = []
            # training is tensor, inference is list
            if isinstance(img_sizes, torch.Tensor):
                img_sizes = img_sizes.view(-1, 2)
            for _bs in range(bs):
                h, w = img_sizes[_bs]
                # Number of 336-sized crops along each axis.
                h = h // 336
                w = w // 336
                B_ = h * w

                # 1 x (24x24) x 1024
                global_img_feature = img_features[_bs, :1]

                # 1 x 12 x 12 x 4096: merge each 2x2 spatial neighbourhood into channels
                glb_img = global_img_feature.reshape(1, H, H, C).reshape(1, H // 2, 2, H // 2, 2, C).contiguous().permute(0, 1, 3, 2, 4, 5).reshape(1, H // 2, H // 2, 4 * C).contiguous()
                # One separator (sub_GN) appended at the end of every feature row.
                temp_glb_GN = self.sub_GN.repeat(1, H // 2, 1, 1)

                # 1 x 156 x 4096
                glb_img = torch.cat([glb_img, temp_glb_GN], dim=2).reshape(1, -1, 4 * C)

                # (max_num_crops-1) x (12x12) x C
                sub_img = img_features[_bs, 1:]
                # 16x574x1024
                # get rid of padding sub_img
                sub_img = sub_img[:B_]

                # (num_crops, 12, 2, 12, 2, 1024) -> (num_crops, 12, 12, 2, 2, 1024) -> (num_crops, 12*12, 4*1024)
                sub_img = sub_img.reshape(B_, H, H, C).reshape(B_, H // 2, 2, H // 2, 2, C).contiguous().permute(0, 1, 3, 2, 4, 5).reshape(B_, -1, 4 * C).contiguous()
                sub_img = sub_img.reshape(1, h, w, 12, 12, -1).permute(0, 1, 3, 2, 4, 5).reshape(1, h * 12, w * 12, 4 * C)
                temp_sub_GN = self.sub_GN.repeat(1, h * 12, 1, 1)
                sub_img = torch.cat([sub_img, temp_sub_GN], dim=2).reshape(1, -1, 4 * C)
                # (1, num_img_tokens, 1024*4)

                # glb + sub
                if self.hd_transform_order == 'glb_sub':
                    output_imgs.append(torch.cat([glb_img, self.glb_GN, sub_img], dim=1))
                elif self.hd_transform_order == 'sub_glb':
                    output_imgs.append(torch.cat([sub_img, self.glb_GN, glb_img], dim=1))
                else:
                    raise NotImplementedError(f'hd_transform_order = {self.hd_transform_order}, not implemented')

                # Expected length: 144 tokens per crop (sub crops + global),
                # 1 glb_GN separator, and one row separator per 12-token row.
                temp_len = int((h * w + 1) * 144 + 1 + (h + 1) * 12)
                assert temp_len == output_imgs[-1].shape[1], f'temp_len: {temp_len}, output_imgs[-1].shape[1]: {output_imgs[-1].shape[1]}'
                output_len.append(temp_len)

            num_img_tokens = output_len
            img_set_tensor = []
            for _output_img in output_imgs:
                img_feature_proj = self.img_projection(_output_img.to(target_device).to(target_dtype))
                img_set_tensor.append(img_feature_proj)
            logger.info(f'img_embeds size: {img_embeds.size()}, image sizes: {img_sizes} loading time {datetime.now() - start_time}')
        elif img_embeds.ndim == 4:
            # One placeholder run of self.num_img_tokens per image.
            selected_g_values = g_values[::self.num_img_tokens]
            assert len(img_embeds) == len(selected_g_values), f'img_embeds size: {img_embeds.size()}, selected_g_values size: {len(selected_g_values)}, selected_g_value {selected_g_values}'
            start_time = datetime.now()
            tt = (
                self.get_img_features(img_embeds)
                .to(target_device)
                .to(target_dtype)
                .reshape(-1, self.image_dim_out)
            )
            logger.info(f'img_embeds size: {img_embeds.size()}, loading time {datetime.now() - start_time}')
            img_set_tensor = self.img_projection(tt)  # adapted visual features.
        elif img_embeds.ndim == 3:
            selected_g_values = g_values[::self.num_img_tokens]
            assert len(img_embeds) == len(selected_g_values), f'img_embeds size: {img_embeds.size()}, selected_g_values size: {len(selected_g_values)}, selected_g_value {selected_g_values}'
            tt = (
                img_embeds
                .to(target_device)
                .to(target_dtype)
                .view(-1, self.image_dim_out)
            )
            img_set_tensor = self.img_projection(tt)  # adapted visual features.
        else:
            raise NotImplementedError
        select = True

    with torch.no_grad():
        # In place on a view of the caller's tensor: placeholder ids become 0.
        input_ids.clamp_min_(0).clamp_max_(self.vocab_size)

    hidden_states = self.wte(input_ids)

    if select:
        if hd_transform:
            idx = 0
            for i, cnt in enumerate(num_img_tokens):
                # positions[idx] is the first placeholder of image i; its run is cnt long.
                hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = (
                    img_set_tensor[i]
                    .to(hidden_states.dtype)
                    .to(hidden_states.device)
                )
                idx += cnt
        else:
            idx = 0
            cnt = self.num_img_tokens
            assert len(selected_g_values) * self.num_img_tokens == len(img_set_tensor), f'len(selected_g_values) * self.num_img_tokens = {len(selected_g_values) * self.num_img_tokens}, len(img_set_tensor) = {len(img_set_tensor)}'
            for i in range(len(selected_g_values)):
                hidden_states[positions[idx, 0], positions[idx, 1] : positions[idx, 1] + cnt] = (
                    img_set_tensor[i * cnt : (i + 1) * cnt]
                    .to(hidden_states.dtype)
                    .to(hidden_states.device)
                )
                idx += cnt

    if self.drop is not None:
        hidden_states = self.drop(hidden_states)

    return hidden_states
image_processing_phi3_v.py ADDED
@@ -0,0 +1,274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """Image processor class for Phi3-V."""
17
+
18
+ from typing import List, Optional, Union
19
+
20
+ import numpy as np
21
+
22
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
23
+ from transformers.image_transforms import (
24
+ convert_to_rgb,
25
+ )
26
+ from transformers.image_utils import (
27
+ OPENAI_CLIP_MEAN,
28
+ OPENAI_CLIP_STD,
29
+ ImageInput,
30
+ make_list_of_images,
31
+ valid_images,
32
+ )
33
+ from transformers.utils import TensorType, is_vision_available, logging
34
+
35
+ from transformers import AutoImageProcessor
36
+
37
+ logger = logging.get_logger(__name__)
38
+
39
+
40
+ if is_vision_available():
41
+ from PIL import Image
42
+
43
+ import torch
44
+ import torchvision
45
+
46
def padding_336(b, padding_unit=336):
    """Pad an image vertically with white so its height is a multiple of ``padding_unit``.

    The padding is split between top and bottom, with the extra row (if any)
    going to the bottom. The width is left untouched.

    Args:
        b: Image exposing ``size`` as ``(width, height)`` and accepted by
            ``torchvision.transforms.functional.pad`` (presumably a PIL image).
        padding_unit: Height granularity; defaults to 336 to match the
            historical behaviour of this function.

    Returns:
        The padded image.
    """
    _, height = b.size
    # Smallest multiple of padding_unit that is >= height.
    target_height = int(np.ceil(height / padding_unit) * padding_unit)
    top_padding = int((target_height - height) / 2)
    bottom_padding = target_height - height - top_padding
    # Padding order is [left, top, right, bottom]; the fill is white.
    return torchvision.transforms.functional.pad(b, [0, top_padding, 0, bottom_padding], fill=[255, 255, 255])
56
+
57
def calc_padded_size(width, height, padding_unit=336):
    """Compute the image size after padding the height up to a multiple of ``padding_unit``.

    Only sizes are computed; no image is touched. The width is never padded.

    Args:
        width: Image width.
        height: Image height.
        padding_unit: Height granularity.

    Returns:
        Tuple ``(padded_width, padded_height)``.
    """
    # Smallest multiple of padding_unit that is >= height.
    rounded_height = int(np.ceil(height / padding_unit) * padding_unit)
    pad_top = int((rounded_height - height) / 2)
    pad_bottom = rounded_height - height - pad_top
    pad_left = pad_right = 0
    return width + pad_left + pad_right, height + pad_top + pad_bottom
66
+
67
def HD_transform(img, hd_num=16):
    """Resize and pad an image so both sides are multiples of 336.

    Portrait images are transposed to landscape first and transposed back at
    the end. The width becomes ``scale * 336`` for the largest ``scale`` with
    ``scale * ceil(scale / aspect_ratio) <= hd_num``; the height follows the
    aspect ratio and is then padded by ``padding_336``.

    Args:
        img: Image with ``size`` as ``(width, height)`` and a ``transpose``
            method (presumably a PIL image).
        hd_num: Upper bound used when choosing ``scale``.

    Returns:
        The transformed image.
    """
    width, height = img.size
    transposed = width < height
    if transposed:
        img = img.transpose(Image.TRANSPOSE)
        width, height = img.size

    aspect = width / height
    # Find the first scale that exceeds the budget, then step back by one.
    scale = 1
    while scale * np.ceil(scale / aspect) <= hd_num:
        scale += 1
    scale -= 1

    target_w = int(scale * 336)
    target_h = int(target_w / aspect)

    img = torchvision.transforms.functional.resize(img, [target_h, target_w],)
    img = padding_336(img)
    if transposed:
        img = img.transpose(Image.TRANSPOSE)

    return img
89
+
90
def calc_hd_transform_size(width, height, hd_num=16):
    """Compute the output size of ``HD_transform`` from an input size alone.

    Follows the same steps as ``HD_transform`` (swap to landscape, pick the
    largest admissible scale, resize, pad the height, swap back) using only
    arithmetic on the sizes.

    Args:
        width: Input image width.
        height: Input image height.
        hd_num: Upper bound used when choosing the scale.

    Returns:
        Tuple ``(padded_width, padded_height)``.
    """
    portrait = width < height
    if portrait:
        width, height = height, width

    aspect = width / height
    # Find the first scale that exceeds the budget, then step back by one.
    scale = 1
    while scale * np.ceil(scale / aspect) <= hd_num:
        scale += 1
    scale -= 1

    resized_w = int(scale * 336)
    resized_h = int(resized_w / aspect)

    out_w, out_h = calc_padded_size(resized_w, resized_h)
    if portrait:
        return out_h, out_w
    return out_w, out_h
111
+
112
def pad_to_max_num_crops_tensor(images, max_crops=5):
    """Zero-pad a stack of crops along dim 0 up to ``max_crops`` entries.

    Args:
        images: Tensor of shape ``(B, C, H, W)``.
        max_crops: Target size of the first dimension.

    Returns:
        ``images`` itself when ``B >= max_crops``; otherwise a new tensor of
        shape ``(max_crops, C, H, W)`` whose trailing entries are zeros with
        the same dtype and device as ``images``.
    """
    num_crops, channels, height, width = images.shape
    if num_crops >= max_crops:
        return images
    # Use the input's own channel count so the concatenation works for any C.
    filler = torch.zeros(
        max_crops - num_crops, channels, height, width,
        dtype=images.dtype, device=images.device,
    )
    return torch.cat([images, filler], dim=0)
121
+
122
+
123
class Phi3VImageProcessor(BaseImageProcessor):
    r"""
    Constructs a Phi3 image processor. Based on [`CLIPImageProcessor`] with incorporation of additional techniques
    for processing high resolution images as explained in the [InternLM-XComposer2-4KHD](https://arxiv.org/abs/2401.16420)
    paper.

    Args:
        num_crops (`int`, *optional*, defaults to 1):
            Passed to `HD_transform` as `hd_num`; every image is padded to `num_crops + 1` crops
            (one global view plus the local crops).
        image_mean (`float` or `List[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
            Mean to use when normalizing the image. This is a float or list of floats the length of the number of
            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
        image_std (`float` or `List[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
            Standard deviation to use when normalizing the image. This is a float or list of floats the length of the
            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
        do_convert_rgb (`bool`, *optional*, defaults to `True`):
            Whether to convert the image to RGB.
    """

    model_input_names = ["pixel_values"]

    def __init__(
        self,
        num_crops: int = 1,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: bool = True,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.num_crops = num_crops
        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
        self.do_convert_rgb = do_convert_rgb

    @staticmethod
    def _count_image_tokens(height, width):
        """Return the number of image tokens for an HD-transformed image of the given size.

        `height` and `width` are expected to be multiples of 336. The count is
        144 tokens per crop (local crops plus one global view), one extra token,
        and 12 tokens per row of crops plus one more row.
        """
        h_crops = height // 336
        w_crops = width // 336
        return int((h_crops * w_crops + 1) * 144 + 1 + (h_crops + 1) * 12)

    def calc_num_image_tokens(
        self,
        images: ImageInput
    ):
        """ Calculate the number of image tokens for each image.

        Args:
            images (`ImageInput`):
                A single image or a batch of images. Each image must provide a `convert('RGB')` method
                (presumably PIL images).

        Returns:
            `List[int]`: one token count per image.
        """
        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        images = [image.convert('RGB') for image in images]
        elems = [HD_transform(im, hd_num=self.num_crops) for im in images]
        # `size` is (width, height); the token formula takes (height, width).
        return [self._count_image_tokens(im.size[1], im.size[0]) for im in elems]

    def calc_num_image_tokens_from_image_size(self, width, height):
        """
        Calculate the number of image tokens for a given image size.

        Args:
            width (`int`): Width of the image.
            height (`int`): Height of the image.

        Returns:
            `int`: the token count for an image of that size after the HD transform.
        """
        new_width, new_height = calc_hd_transform_size(width, height, hd_num=self.num_crops)
        return self._count_image_tokens(new_height, new_width)

    def preprocess(
        self,
        images: ImageInput,
        image_mean: Optional[Union[float, List[float]]] = None,
        image_std: Optional[Union[float, List[float]]] = None,
        do_convert_rgb: Optional[bool] = None,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ):
        """
        Apply the HD transform, normalize, and split each image into 336x336 crops.

        Args:
            images (`ImageInput`):
                A single image or a batch of images. Each image must provide a `convert('RGB')` method
                (presumably PIL images).
            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
                Image mean to use for normalization. Normalization is always applied.
            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
                Image standard deviation to use for normalization. Normalization is always applied.
            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
                Whether to convert the image to RGB with `convert_to_rgb`.
            return_tensors (`str` or `TensorType`, *optional*):
                Forwarded to `BatchFeature` as `tensor_type`. Can be one of:
                    - Unset: Leave the data as produced by this method.
                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.

        Returns:
            `BatchFeature` with keys `pixel_values` (stacked crops, global view first, padded to
            `num_crops + 1` crops per image), `image_sizes` (`[height, width]` per image after the HD
            transform) and `num_img_tokens` (token count per image).
        """
        image_mean = image_mean if image_mean is not None else self.image_mean
        image_std = image_std if image_std is not None else self.image_std
        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb

        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        if do_convert_rgb:
            images = [convert_to_rgb(image) for image in images]

        img_processor = torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize(image_mean, image_std)
        ])

        # PIL images
        # HD_transform pads images to a size that is a multiple of 336 on both sides.
        # NOTE(review): this conversion is unconditional, so images are converted to RGB
        # even when do_convert_rgb is False -- confirm whether that is intended.
        images = [image.convert('RGB') for image in images]
        elems = [HD_transform(im, hd_num=self.num_crops) for im in images]
        # tensor transform and normalize
        hd_images = [img_processor(im) for im in elems]
        # create global image: the whole picture resized to a single 336x336 view
        global_image = [
            torch.nn.functional.interpolate(im.unsqueeze(0).float(), size=(336, 336), mode='bicubic',).to(im.dtype)
            for im in hd_images
        ]

        # [(3, h, w)], where h, w is multiple of 336
        shapes = [[im.size(1), im.size(2)] for im in hd_images]
        num_img_tokens = [self._count_image_tokens(h, w) for h, w in shapes]
        # reshape to channel dimension -> (num_images, num_crops, 3, 336, 336)
        # (1, 3, h//336, 336, w//336, 336) -> (1, h//336, w//336, 3, 336, 336) -> (h//336*w//336, 3, 336, 336)
        hd_images_reshape = [
            im.reshape(1, 3, h // 336, 336, w // 336, 336).permute(0, 2, 4, 1, 3, 5).reshape(-1, 3, 336, 336).contiguous()
            for im, (h, w) in zip(hd_images, shapes)
        ]
        # concat global image and local image, global view first
        hd_images_reshape = [
            torch.cat([_global_image, _im], dim=0)
            for _global_image, _im in zip(global_image, hd_images_reshape)
        ]

        # pad to max_num_crops (+1 for the global view)
        image_transformed = [pad_to_max_num_crops_tensor(im, self.num_crops + 1) for im in hd_images_reshape]
        padded_images = torch.stack(image_transformed, dim=0)

        data = {"pixel_values": padded_images,
                "image_sizes": shapes,
                "num_img_tokens": num_img_tokens
                }

        return BatchFeature(data=data, tensor_type=return_tensors)
273
+
274
# Register the processor class with AutoImageProcessor under the key "Phi3VImageProcessor".
AutoImageProcessor.register("Phi3VImageProcessor", Phi3VImageProcessor)
model-00001-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:175b2fe918dd8bd2549e3441615ee0c6d7b1f6d638c0104a614546f55c273482
3
+ size 4944122112
model-00002-of-00002.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e61ece5a8f0c9663afa06cc22799056f5cc084fb993518bf036dc8e268fd4c94
3
+ size 3349208776
model.safetensors.index.json ADDED
@@ -0,0 +1,599 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_size": 8293242880
4
+ },
5
+ "weight_map": {
6
+ "lm_head.weight": "model-00002-of-00002.safetensors",
7
+ "model.embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
9
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
10
+ "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
11
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
12
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
13
+ "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
14
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
15
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
16
+ "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
17
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
18
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
19
+ "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
20
+ "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
21
+ "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
22
+ "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
23
+ "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
24
+ "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
25
+ "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
26
+ "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
27
+ "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
28
+ "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
29
+ "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
30
+ "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
31
+ "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
32
+ "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
33
+ "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
34
+ "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
35
+ "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
36
+ "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
37
+ "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
38
+ "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
39
+ "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
40
+ "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
41
+ "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
42
+ "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
43
+ "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
44
+ "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
45
+ "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
46
+ "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
47
+ "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
48
+ "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
49
+ "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
50
+ "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
51
+ "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
52
+ "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
53
+ "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
54
+ "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
55
+ "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
56
+ "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
57
+ "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
58
+ "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
59
+ "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
60
+ "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
61
+ "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
62
+ "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
63
+ "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
64
+ "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
65
+ "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
66
+ "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
67
+ "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
68
+ "model.layers.18.input_layernorm.weight": "model-00002-of-00002.safetensors",
69
+ "model.layers.18.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
70
+ "model.layers.18.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
71
+ "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
72
+ "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
73
+ "model.layers.18.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
74
+ "model.layers.19.input_layernorm.weight": "model-00002-of-00002.safetensors",
75
+ "model.layers.19.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
76
+ "model.layers.19.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
77
+ "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
78
+ "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
79
+ "model.layers.19.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
80
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
81
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
82
+ "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
83
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
84
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
85
+ "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
86
+ "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors",
87
+ "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
88
+ "model.layers.20.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
89
+ "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
90
+ "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
91
+ "model.layers.20.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
92
+ "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
93
+ "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
94
+ "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
95
+ "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
96
+ "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
97
+ "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
98
+ "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
99
+ "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
100
+ "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
101
+ "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
102
+ "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
103
+ "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
104
+ "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
105
+ "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
106
+ "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
107
+ "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
108
+ "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
109
+ "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
110
+ "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
111
+ "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
112
+ "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
113
+ "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
114
+ "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
115
+ "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
116
+ "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
117
+ "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
118
+ "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
119
+ "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
120
+ "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
121
+ "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
122
+ "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
123
+ "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
124
+ "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
125
+ "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
126
+ "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
127
+ "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
128
+ "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
129
+ "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
130
+ "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
131
+ "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
132
+ "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
133
+ "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
134
+ "model.layers.28.input_layernorm.weight": "model-00002-of-00002.safetensors",
135
+ "model.layers.28.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
136
+ "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
137
+ "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
138
+ "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
139
+ "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
140
+ "model.layers.29.input_layernorm.weight": "model-00002-of-00002.safetensors",
141
+ "model.layers.29.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
142
+ "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
143
+ "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
144
+ "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
145
+ "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
146
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
147
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
148
+ "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
149
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
150
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
151
+ "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
152
+ "model.layers.30.input_layernorm.weight": "model-00002-of-00002.safetensors",
153
+ "model.layers.30.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
154
+ "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
155
+ "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
156
+ "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
157
+ "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
158
+ "model.layers.31.input_layernorm.weight": "model-00002-of-00002.safetensors",
159
+ "model.layers.31.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
160
+ "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00002.safetensors",
161
+ "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
162
+ "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
163
+ "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
164
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
165
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
166
+ "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
167
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
168
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
169
+ "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
170
+ "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
171
+ "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
172
+ "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
173
+ "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
174
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
175
+ "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
176
+ "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
177
+ "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
178
+ "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
179
+ "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
180
+ "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
181
+ "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
182
+ "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
183
+ "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
184
+ "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
185
+ "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
186
+ "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
187
+ "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
188
+ "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
189
+ "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
190
+ "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
191
+ "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
192
+ "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
193
+ "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
194
+ "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
195
+ "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
196
+ "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00002.safetensors",
197
+ "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
198
+ "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
199
+ "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00002.safetensors",
200
+ "model.norm.weight": "model-00002-of-00002.safetensors",
201
+ "model.vision_embed_tokens.glb_GN": "model-00001-of-00002.safetensors",
202
+ "model.vision_embed_tokens.img_processor.vision_model.embeddings.class_embedding": "model-00001-of-00002.safetensors",
203
+ "model.vision_embed_tokens.img_processor.vision_model.embeddings.patch_embedding.weight": "model-00001-of-00002.safetensors",
204
+ "model.vision_embed_tokens.img_processor.vision_model.embeddings.position_embedding.weight": "model-00001-of-00002.safetensors",
205
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm1.bias": "model-00001-of-00002.safetensors",
206
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm1.weight": "model-00001-of-00002.safetensors",
207
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm2.bias": "model-00001-of-00002.safetensors",
208
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.layer_norm2.weight": "model-00001-of-00002.safetensors",
209
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc1.bias": "model-00001-of-00002.safetensors",
210
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc1.weight": "model-00001-of-00002.safetensors",
211
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc2.bias": "model-00001-of-00002.safetensors",
212
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.mlp.fc2.weight": "model-00001-of-00002.safetensors",
213
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
214
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
215
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
216
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
217
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
218
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
219
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
220
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
221
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm1.bias": "model-00001-of-00002.safetensors",
222
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm1.weight": "model-00001-of-00002.safetensors",
223
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm2.bias": "model-00001-of-00002.safetensors",
224
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.layer_norm2.weight": "model-00001-of-00002.safetensors",
225
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc1.bias": "model-00001-of-00002.safetensors",
226
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc1.weight": "model-00001-of-00002.safetensors",
227
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc2.bias": "model-00001-of-00002.safetensors",
228
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.mlp.fc2.weight": "model-00001-of-00002.safetensors",
229
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
230
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
231
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
232
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
233
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
234
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
235
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
236
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
237
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm1.bias": "model-00001-of-00002.safetensors",
238
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm1.weight": "model-00001-of-00002.safetensors",
239
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm2.bias": "model-00001-of-00002.safetensors",
240
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.layer_norm2.weight": "model-00001-of-00002.safetensors",
241
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc1.bias": "model-00001-of-00002.safetensors",
242
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc1.weight": "model-00001-of-00002.safetensors",
243
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc2.bias": "model-00001-of-00002.safetensors",
244
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.mlp.fc2.weight": "model-00001-of-00002.safetensors",
245
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
246
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
247
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
248
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
249
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
250
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
251
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
252
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
253
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm1.bias": "model-00001-of-00002.safetensors",
254
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm1.weight": "model-00001-of-00002.safetensors",
255
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm2.bias": "model-00001-of-00002.safetensors",
256
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.layer_norm2.weight": "model-00001-of-00002.safetensors",
257
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc1.bias": "model-00001-of-00002.safetensors",
258
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc1.weight": "model-00001-of-00002.safetensors",
259
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc2.bias": "model-00001-of-00002.safetensors",
260
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.mlp.fc2.weight": "model-00001-of-00002.safetensors",
261
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
262
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
263
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
264
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
265
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
266
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
267
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
268
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
269
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm1.bias": "model-00001-of-00002.safetensors",
270
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm1.weight": "model-00001-of-00002.safetensors",
271
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm2.bias": "model-00001-of-00002.safetensors",
272
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.layer_norm2.weight": "model-00001-of-00002.safetensors",
273
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc1.bias": "model-00001-of-00002.safetensors",
274
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc1.weight": "model-00001-of-00002.safetensors",
275
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc2.bias": "model-00001-of-00002.safetensors",
276
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.mlp.fc2.weight": "model-00001-of-00002.safetensors",
277
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
278
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
279
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
280
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
281
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
282
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
283
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
284
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
285
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm1.bias": "model-00001-of-00002.safetensors",
286
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm1.weight": "model-00001-of-00002.safetensors",
287
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm2.bias": "model-00001-of-00002.safetensors",
288
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.layer_norm2.weight": "model-00001-of-00002.safetensors",
289
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc1.bias": "model-00001-of-00002.safetensors",
290
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc1.weight": "model-00001-of-00002.safetensors",
291
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc2.bias": "model-00001-of-00002.safetensors",
292
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.mlp.fc2.weight": "model-00001-of-00002.safetensors",
293
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
294
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
295
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
296
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
297
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
298
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
299
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
300
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
301
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm1.bias": "model-00001-of-00002.safetensors",
302
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm1.weight": "model-00001-of-00002.safetensors",
303
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm2.bias": "model-00001-of-00002.safetensors",
304
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.layer_norm2.weight": "model-00001-of-00002.safetensors",
305
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc1.bias": "model-00001-of-00002.safetensors",
306
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc1.weight": "model-00001-of-00002.safetensors",
307
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc2.bias": "model-00001-of-00002.safetensors",
308
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.mlp.fc2.weight": "model-00001-of-00002.safetensors",
309
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
310
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
311
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
312
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
313
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
314
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
315
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
316
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
317
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm1.bias": "model-00001-of-00002.safetensors",
318
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm1.weight": "model-00001-of-00002.safetensors",
319
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm2.bias": "model-00001-of-00002.safetensors",
320
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.layer_norm2.weight": "model-00001-of-00002.safetensors",
321
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc1.bias": "model-00001-of-00002.safetensors",
322
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc1.weight": "model-00001-of-00002.safetensors",
323
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc2.bias": "model-00001-of-00002.safetensors",
324
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.mlp.fc2.weight": "model-00001-of-00002.safetensors",
325
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
326
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
327
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
328
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
329
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
330
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
331
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
332
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
333
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm1.bias": "model-00001-of-00002.safetensors",
334
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm1.weight": "model-00001-of-00002.safetensors",
335
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm2.bias": "model-00001-of-00002.safetensors",
336
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.layer_norm2.weight": "model-00001-of-00002.safetensors",
337
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc1.bias": "model-00001-of-00002.safetensors",
338
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc1.weight": "model-00001-of-00002.safetensors",
339
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc2.bias": "model-00001-of-00002.safetensors",
340
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.mlp.fc2.weight": "model-00001-of-00002.safetensors",
341
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
342
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
343
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
344
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
345
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
346
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
347
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
348
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
349
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm1.bias": "model-00001-of-00002.safetensors",
350
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm1.weight": "model-00001-of-00002.safetensors",
351
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm2.bias": "model-00001-of-00002.safetensors",
352
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.layer_norm2.weight": "model-00001-of-00002.safetensors",
353
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc1.bias": "model-00001-of-00002.safetensors",
354
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc1.weight": "model-00001-of-00002.safetensors",
355
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc2.bias": "model-00001-of-00002.safetensors",
356
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.mlp.fc2.weight": "model-00001-of-00002.safetensors",
357
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
358
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
359
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
360
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
361
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
362
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
363
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
364
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
365
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm1.bias": "model-00001-of-00002.safetensors",
366
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm1.weight": "model-00001-of-00002.safetensors",
367
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm2.bias": "model-00001-of-00002.safetensors",
368
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.layer_norm2.weight": "model-00001-of-00002.safetensors",
369
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc1.bias": "model-00001-of-00002.safetensors",
370
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc1.weight": "model-00001-of-00002.safetensors",
371
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc2.bias": "model-00001-of-00002.safetensors",
372
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.mlp.fc2.weight": "model-00001-of-00002.safetensors",
373
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
374
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
375
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
376
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
377
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
378
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
379
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
380
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
381
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm1.bias": "model-00001-of-00002.safetensors",
382
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm1.weight": "model-00001-of-00002.safetensors",
383
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm2.bias": "model-00001-of-00002.safetensors",
384
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.layer_norm2.weight": "model-00001-of-00002.safetensors",
385
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc1.bias": "model-00001-of-00002.safetensors",
386
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc1.weight": "model-00001-of-00002.safetensors",
387
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc2.bias": "model-00001-of-00002.safetensors",
388
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.mlp.fc2.weight": "model-00001-of-00002.safetensors",
389
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
390
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
391
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
392
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
393
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
394
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
395
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
396
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
397
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm1.bias": "model-00001-of-00002.safetensors",
398
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm1.weight": "model-00001-of-00002.safetensors",
399
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm2.bias": "model-00001-of-00002.safetensors",
400
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.layer_norm2.weight": "model-00001-of-00002.safetensors",
401
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc1.bias": "model-00001-of-00002.safetensors",
402
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc1.weight": "model-00001-of-00002.safetensors",
403
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc2.bias": "model-00001-of-00002.safetensors",
404
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.mlp.fc2.weight": "model-00001-of-00002.safetensors",
405
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
406
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
407
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
408
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
409
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
410
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
411
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
412
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
413
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm1.bias": "model-00001-of-00002.safetensors",
414
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm1.weight": "model-00001-of-00002.safetensors",
415
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm2.bias": "model-00001-of-00002.safetensors",
416
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.layer_norm2.weight": "model-00001-of-00002.safetensors",
417
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc1.bias": "model-00001-of-00002.safetensors",
418
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc1.weight": "model-00001-of-00002.safetensors",
419
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc2.bias": "model-00001-of-00002.safetensors",
420
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.mlp.fc2.weight": "model-00001-of-00002.safetensors",
421
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
422
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
423
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
424
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
425
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
426
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
427
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
428
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
429
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm1.bias": "model-00001-of-00002.safetensors",
430
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm1.weight": "model-00001-of-00002.safetensors",
431
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm2.bias": "model-00001-of-00002.safetensors",
432
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.layer_norm2.weight": "model-00001-of-00002.safetensors",
433
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc1.bias": "model-00001-of-00002.safetensors",
434
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc1.weight": "model-00001-of-00002.safetensors",
435
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc2.bias": "model-00001-of-00002.safetensors",
436
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.mlp.fc2.weight": "model-00001-of-00002.safetensors",
437
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
438
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
439
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
440
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
441
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
442
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
443
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
444
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
445
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm1.bias": "model-00001-of-00002.safetensors",
446
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm1.weight": "model-00001-of-00002.safetensors",
447
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm2.bias": "model-00001-of-00002.safetensors",
448
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.layer_norm2.weight": "model-00001-of-00002.safetensors",
449
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc1.bias": "model-00001-of-00002.safetensors",
450
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc1.weight": "model-00001-of-00002.safetensors",
451
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc2.bias": "model-00001-of-00002.safetensors",
452
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.mlp.fc2.weight": "model-00001-of-00002.safetensors",
453
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
454
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
455
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
456
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
457
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
458
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
459
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
460
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
461
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm1.bias": "model-00001-of-00002.safetensors",
462
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm1.weight": "model-00001-of-00002.safetensors",
463
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm2.bias": "model-00001-of-00002.safetensors",
464
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.layer_norm2.weight": "model-00001-of-00002.safetensors",
465
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc1.bias": "model-00001-of-00002.safetensors",
466
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc1.weight": "model-00001-of-00002.safetensors",
467
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc2.bias": "model-00001-of-00002.safetensors",
468
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.mlp.fc2.weight": "model-00001-of-00002.safetensors",
469
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
470
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
471
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
472
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
473
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
474
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
475
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
476
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
477
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm1.bias": "model-00001-of-00002.safetensors",
478
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm1.weight": "model-00001-of-00002.safetensors",
479
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm2.bias": "model-00001-of-00002.safetensors",
480
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.layer_norm2.weight": "model-00001-of-00002.safetensors",
481
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc1.bias": "model-00001-of-00002.safetensors",
482
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc1.weight": "model-00001-of-00002.safetensors",
483
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc2.bias": "model-00001-of-00002.safetensors",
484
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.mlp.fc2.weight": "model-00001-of-00002.safetensors",
485
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
486
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
487
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
488
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
489
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
490
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
491
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
492
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
493
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm1.bias": "model-00001-of-00002.safetensors",
494
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm1.weight": "model-00001-of-00002.safetensors",
495
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm2.bias": "model-00001-of-00002.safetensors",
496
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.layer_norm2.weight": "model-00001-of-00002.safetensors",
497
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc1.bias": "model-00001-of-00002.safetensors",
498
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc1.weight": "model-00001-of-00002.safetensors",
499
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc2.bias": "model-00001-of-00002.safetensors",
500
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.mlp.fc2.weight": "model-00001-of-00002.safetensors",
501
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
502
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
503
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
504
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
505
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
506
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
507
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
508
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
509
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm1.bias": "model-00001-of-00002.safetensors",
510
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm1.weight": "model-00001-of-00002.safetensors",
511
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm2.bias": "model-00001-of-00002.safetensors",
512
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.layer_norm2.weight": "model-00001-of-00002.safetensors",
513
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc1.bias": "model-00001-of-00002.safetensors",
514
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc1.weight": "model-00001-of-00002.safetensors",
515
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc2.bias": "model-00001-of-00002.safetensors",
516
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.mlp.fc2.weight": "model-00001-of-00002.safetensors",
517
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
518
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
519
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
520
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
521
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
522
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
523
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
524
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
525
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm1.bias": "model-00001-of-00002.safetensors",
526
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm1.weight": "model-00001-of-00002.safetensors",
527
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm2.bias": "model-00001-of-00002.safetensors",
528
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.layer_norm2.weight": "model-00001-of-00002.safetensors",
529
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc1.bias": "model-00001-of-00002.safetensors",
530
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc1.weight": "model-00001-of-00002.safetensors",
531
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc2.bias": "model-00001-of-00002.safetensors",
532
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.mlp.fc2.weight": "model-00001-of-00002.safetensors",
533
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
534
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
535
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
536
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
537
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
538
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
539
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
540
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
541
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm1.bias": "model-00001-of-00002.safetensors",
542
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm1.weight": "model-00001-of-00002.safetensors",
543
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm2.bias": "model-00001-of-00002.safetensors",
544
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.layer_norm2.weight": "model-00001-of-00002.safetensors",
545
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc1.bias": "model-00001-of-00002.safetensors",
546
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc1.weight": "model-00001-of-00002.safetensors",
547
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc2.bias": "model-00001-of-00002.safetensors",
548
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.mlp.fc2.weight": "model-00001-of-00002.safetensors",
549
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
550
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
551
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
552
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
553
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
554
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
555
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
556
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
557
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm1.bias": "model-00001-of-00002.safetensors",
558
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm1.weight": "model-00001-of-00002.safetensors",
559
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm2.bias": "model-00001-of-00002.safetensors",
560
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.layer_norm2.weight": "model-00001-of-00002.safetensors",
561
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc1.bias": "model-00001-of-00002.safetensors",
562
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc1.weight": "model-00001-of-00002.safetensors",
563
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc2.bias": "model-00001-of-00002.safetensors",
564
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.mlp.fc2.weight": "model-00001-of-00002.safetensors",
565
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
566
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
567
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
568
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
569
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
570
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
571
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
572
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
573
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm1.bias": "model-00001-of-00002.safetensors",
574
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm1.weight": "model-00001-of-00002.safetensors",
575
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm2.bias": "model-00001-of-00002.safetensors",
576
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.layer_norm2.weight": "model-00001-of-00002.safetensors",
577
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc1.bias": "model-00001-of-00002.safetensors",
578
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc1.weight": "model-00001-of-00002.safetensors",
579
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc2.bias": "model-00001-of-00002.safetensors",
580
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.mlp.fc2.weight": "model-00001-of-00002.safetensors",
581
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
582
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
583
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
584
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
585
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
586
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
587
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
588
+ "model.vision_embed_tokens.img_processor.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
589
+ "model.vision_embed_tokens.img_processor.vision_model.post_layernorm.bias": "model-00001-of-00002.safetensors",
590
+ "model.vision_embed_tokens.img_processor.vision_model.post_layernorm.weight": "model-00001-of-00002.safetensors",
591
+ "model.vision_embed_tokens.img_processor.vision_model.pre_layrnorm.bias": "model-00001-of-00002.safetensors",
592
+ "model.vision_embed_tokens.img_processor.vision_model.pre_layrnorm.weight": "model-00001-of-00002.safetensors",
593
+ "model.vision_embed_tokens.img_projection.0.bias": "model-00001-of-00002.safetensors",
594
+ "model.vision_embed_tokens.img_projection.0.weight": "model-00001-of-00002.safetensors",
595
+ "model.vision_embed_tokens.img_projection.2.bias": "model-00001-of-00002.safetensors",
596
+ "model.vision_embed_tokens.img_projection.2.weight": "model-00001-of-00002.safetensors",
597
+ "model.vision_embed_tokens.sub_GN": "model-00001-of-00002.safetensors"
598
+ }
599
+ }
modeling_phi3_v.py ADDED
@@ -0,0 +1,780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ import glob
3
+ import json
4
+ from pathlib import Path
5
+ from typing import Dict, Optional, List, Tuple, Union
6
+
7
+ import math
8
+ import warnings
9
+ import mlx.core as mx
10
+ import mlx.nn as nn
11
+
12
+ import logging
13
+ # from llms.mlx_lm.models.base import BaseModelArgs
14
+ from configuration_phi3_v import Phi3VConfig
15
+ from utils import BaseModelOutputWithPast, FloatTensor, LongTensor, Cache, DynamicCache, CausalLMOutputWithPast
16
+ from image_embedding_phi3_v import Phi3ImageEmbedding
17
+ from attn_mask import _prepare_4d_causal_attention_mask
18
+ from huggingface_hub import snapshot_download
19
+
20
class Phi3RotaryEmbedding(nn.Module):
    """Rotary position embedding that yields cos/sin tables for given positions.

    Args:
        dim: Rotary dimension; ``dim // 2`` inverse frequencies are generated.
        max_position_embeddings: Stored on the instance (not read by
            ``__call__`` in this class).
        base: Base of the geometric progression of inverse frequencies.
    """

    def __init__(self, dim, max_position_embeddings=2048, base=10000):
        super().__init__()
        self.dim = dim
        self.max_position_embeddings = max_position_embeddings
        self.base = base

    def __call__(self, x, position_ids, seq_len=None):
        """Return ``(cos, sin)`` for ``position_ids``, cast to ``x.dtype``.

        Args:
            x: Array whose dtype the outputs are cast to.
            position_ids: Integer positions, indexed as ``(batch, seq)``.
            seq_len: Accepted for signature compatibility; unused.

        Returns:
            Tuple ``(cos, sin)``, each of shape ``(batch, seq, dim)``.
        """
        # ``inv_freq`` is never assigned in ``__init__``, so a plain attribute
        # read would raise AttributeError on the first call; use a default.
        inv_freq = getattr(self, "inv_freq", None)
        if inv_freq is None:
            exponents = mx.arange(0, self.dim, 2, dtype=mx.float32) / self.dim
            inv_freq = 1.0 / (self.base ** exponents)
            self.inv_freq = inv_freq

        # Outer product of positions and inverse frequencies via broadcasting:
        # (batch, seq, 1) * (1, 1, dim // 2) -> (batch, seq, dim // 2).
        positions = position_ids.astype(mx.float32)
        freqs = positions[:, :, None] * inv_freq.astype(mx.float32)[None, None, :]
        emb = mx.concatenate([freqs, freqs], axis=-1)
        return mx.cos(emb).astype(x.dtype), mx.sin(emb).astype(x.dtype)
40
+
41
class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding):
    """Rotary embedding with per-frequency "su" rescaling factors.

    Reads ``short_factor`` / ``long_factor`` from ``config.rope_scaling`` and
    ``original_max_position_embeddings``, ``max_position_embeddings`` and
    ``rope_theta`` from ``config``.
    """

    def __init__(self, dim, config):
        super().__init__(dim, config.max_position_embeddings, config.rope_theta)
        self.short_factor = config.rope_scaling["short_factor"]
        self.long_factor = config.rope_scaling["long_factor"]
        self.original_max_position_embeddings = config.original_max_position_embeddings

    def __call__(self, x, position_ids, seq_len=None):
        """Return scaled ``(cos, sin)`` for ``position_ids``, cast to ``x.dtype``.

        Args:
            x: Array whose dtype the outputs are cast to.
            position_ids: Integer positions, indexed as ``(batch, seq)``.
            seq_len: Ignored; the effective length is derived from
                ``position_ids``.

        Returns:
            Tuple ``(cos, sin)``, each of shape ``(batch, seq, dim)``.
        """
        # Effective sequence length is the largest position id plus one.
        seq_len = int(mx.max(position_ids).item()) + 1
        if seq_len > self.original_max_position_embeddings:
            ext_factors = mx.array(self.long_factor, dtype=mx.float32)
        else:
            ext_factors = mx.array(self.short_factor, dtype=mx.float32)

        exponents = mx.arange(0, self.dim, 2, dtype=mx.float32) / self.dim
        self.inv_freq = 1.0 / (ext_factors * self.base ** exponents)

        # Outer product of positions and inverse frequencies via broadcasting:
        # (batch, seq, 1) * (1, 1, dim // 2) -> (batch, seq, dim // 2).
        positions = position_ids.astype(mx.float32)
        freqs = positions[:, :, None] * self.inv_freq[None, None, :]
        emb = mx.concatenate([freqs, freqs], axis=-1)

        scale = self.max_position_embeddings / self.original_max_position_embeddings
        if scale <= 1.0:
            scaling_factor = 1.0
        else:
            scaling_factor = math.sqrt(
                1 + math.log(scale) / math.log(self.original_max_position_embeddings)
            )

        cos = mx.cos(emb) * scaling_factor
        sin = mx.sin(emb) * scaling_factor
        return cos.astype(x.dtype), sin.astype(x.dtype)
73
+
74
class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding):
    """Rotary embedding with per-frequency factors and a logarithmic magnitude scale.

    Reads ``short_factor`` / ``long_factor`` from ``config.rope_scaling`` and
    ``original_max_position_embeddings``, ``max_position_embeddings`` and
    ``rope_theta`` from ``config``.
    """

    def __init__(self, dim, config):
        super().__init__(dim, config.max_position_embeddings, config.rope_theta)
        self.short_factor = config.rope_scaling["short_factor"]
        self.long_factor = config.rope_scaling["long_factor"]
        self.original_max_position_embeddings = config.original_max_position_embeddings

    def __call__(self, x, position_ids, seq_len=None):
        """Return scaled ``(cos, sin)`` for ``position_ids``, cast to ``x.dtype``.

        Args:
            x: Array whose dtype the outputs are cast to.
            position_ids: Integer positions, indexed as ``(batch, seq)``.
            seq_len: Ignored; the effective length is derived from
                ``position_ids``.

        Returns:
            Tuple ``(cos, sin)``, each of shape ``(batch, seq, dim)``.
        """
        # Effective sequence length is the largest position id plus one.
        seq_len = int(mx.max(position_ids).item()) + 1
        if seq_len > self.original_max_position_embeddings:
            ext_factors = mx.array(self.long_factor, dtype=mx.float32)
        else:
            ext_factors = mx.array(self.short_factor, dtype=mx.float32)

        exponents = mx.arange(0, self.dim, 2, dtype=mx.float32) / self.dim
        self.inv_freq = 1.0 / (ext_factors * self.base ** exponents)

        # The former ``x.device.type`` lookup was removed: its result was
        # never used and ``mx.array`` exposes no ``device`` attribute.
        # Outer product of positions and inverse frequencies via broadcasting:
        # (batch, seq, 1) * (1, 1, dim // 2) -> (batch, seq, dim // 2).
        positions = position_ids.astype(mx.float32)
        freqs = positions[:, :, None] * self.inv_freq[None, None, :]
        emb = mx.concatenate([freqs, freqs], axis=-1)

        scale = self.max_position_embeddings / self.original_max_position_embeddings
        if scale <= 1.0:
            scaling_factor = 1.0
        else:
            scaling_factor = 0.1 * math.log(scale) + 1.0

        cos = mx.cos(emb) * scaling_factor
        sin = mx.sin(emb) * scaling_factor
        return cos.astype(x.dtype), sin.astype(x.dtype)
108
+
109
def rotate_half(x):
    """Rotate the last axis by half: ``[x1, x2] -> [-x2, x1]``.

    Args:
        x: Array whose last axis is split at ``x.shape[-1] // 2``.

    Returns:
        Array of the same shape with the negated second half placed before
        the first half.
    """
    half = x.shape[-1] // 2
    x1 = x[..., :half]
    x2 = x[..., half:]
    # MLX names the concatenation axis ``axis`` (``dim`` is the torch spelling).
    return mx.concatenate((-x2, x1), axis=-1)
113
+
114
def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    """Apply rotary position embeddings to the query and key arrays.

    Args:
        q: Query array.
        k: Key array.
        cos: Cosine table; a singleton axis is inserted at ``unsqueeze_dim``
            so it broadcasts against ``q`` and ``k``.
        sin: Sine table, handled like ``cos``.
        position_ids: Unused; kept for call-site compatibility.
        unsqueeze_dim: Axis at which the singleton dimension is inserted.

    Returns:
        Tuple ``(q_embed, k_embed)`` of the rotated query and key arrays.
    """
    # ``mx.array`` has no ``unsqueeze``; ``expand_dims`` is the equivalent.
    cos = mx.expand_dims(cos, unsqueeze_dim)
    sin = mx.expand_dims(sin, unsqueeze_dim)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed
120
+
121
class Phi3MLP(nn.Module):
    """Gated feed-forward block with a fused gate/up projection."""

    def __init__(self, config: Phi3VConfig):
        """Create the two bias-free projections sized from ``config``."""
        super().__init__()
        self.config = config

        hidden_size = config.hidden_size
        intermediate_size = config.intermediate_size
        # A single matmul produces both the gate and the up branch.
        self.gate_up_proj = nn.Linear(hidden_size, 2 * intermediate_size, bias=False)
        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

    def __call__(self, x) -> mx.array:
        """Return ``down_proj(silu(gate) * up)`` for the input ``x``."""
        fused = self.gate_up_proj(x)
        # First half of the last axis is the gate, second half the up branch.
        gate, up = mx.split(fused, 2, axis=-1)
        gated = nn.silu(gate) * up
        return self.down_proj(gated)
133
+
134
def repeat_kv(hidden_states: mx.array, n_rep: int) -> mx.array:
    """Repeat every key/value head ``n_rep`` times along the head axis.

    Args:
        hidden_states: Array of shape
            ``(batch, num_key_value_heads, seq_len, head_dim)``.
        n_rep: Number of copies per head; ``1`` returns the input unchanged.

    Returns:
        Array of shape ``(batch, num_key_value_heads * n_rep, seq_len, head_dim)``
        in which each head's copies are adjacent.
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    # ``mx.array`` has no ``expand``; broadcast a new singleton axis instead.
    expanded = mx.broadcast_to(
        hidden_states[:, :, None, :, :],
        (batch, num_key_value_heads, n_rep, slen, head_dim),
    )
    return expanded.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
140
+
141
class Phi3Attention(nn.Module):
    """Eager (explicit matmul + softmax) self-attention with rotary embeddings.

    Queries, keys and values come from a single fused ``qkv_proj``. Key/value
    heads are repeated ``num_heads // num_key_value_heads`` times so that they
    line up with the query heads.
    """

    # Flipped after the first forward pass so the "not flash-attention" notice
    # is logged once per process instead of once per layer per call.
    _eager_notice_logged = False

    def __init__(self, config: Phi3VConfig, layer_idx: Optional[int] = None):
        """Build the projections and the rotary embedding.

        Args:
            config: Model configuration.
            layer_idx: Index of this layer, passed to the key/value cache.
                A warning is logged when it is omitted.

        Raises:
            ValueError: If ``hidden_size`` is not divisible by the number of
                attention heads, or the RoPE scaling type is unknown.
        """
        super().__init__()
        self.config = config
        self.layer_idx = layer_idx
        if layer_idx is None:
            logging.warning(
                "Instantiating %s without passing a `layer_idx` is not recommended and will "
                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
                "when creating this class.",
                self.__class__.__name__,
            )

        self.attention_dropout = config.attention_dropout
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.hidden_size // self.num_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
        self.max_position_embeddings = config.max_position_embeddings
        self.original_max_position_embeddings = config.original_max_position_embeddings
        self.rope_theta = config.rope_theta
        self.rope_scaling = config.rope_scaling
        self.is_causal = True

        if (self.head_dim * self.num_heads) != self.hidden_size:
            raise ValueError(
                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
                f" and `num_heads`: {self.num_heads})."
            )

        op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim)
        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
        self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False)
        # MLX has no functional dropout in ``mlx.core``; this parameter-free
        # layer follows the module's train/eval mode.
        self.attn_dropout = nn.Dropout(self.attention_dropout)
        self._init_rope()

    def _init_rope(self):
        """Instantiate the rotary embedding selected by ``rope_scaling``."""
        if self.rope_scaling is None:
            self.rotary_emb = Phi3RotaryEmbedding(
                self.head_dim,
                max_position_embeddings=self.max_position_embeddings,
                base=self.rope_theta,
            )
        else:
            scaling_type = self.config.rope_scaling["type"]
            if scaling_type == "su":
                self.rotary_emb = Phi3SuScaledRotaryEmbedding(self.head_dim, self.config)
            elif scaling_type == "yarn":
                self.rotary_emb = Phi3YarnScaledRotaryEmbedding(self.head_dim, self.config)
            else:
                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

    def __call__(
        self,
        hidden_states: mx.array,
        attention_mask: Optional[mx.array] = None,
        position_ids: Optional[LongTensor] = None,
        past_key_value: Optional[Tuple[mx.array, mx.array]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[mx.array, Optional[mx.array], Optional[Tuple[mx.array]]]:
        """Run self-attention over ``hidden_states``.

        Args:
            hidden_states: Input of shape ``(batch, q_len, hidden_size)``.
            attention_mask: Optional additive mask of shape
                ``(batch, 1, q_len, kv_seq_len)``.
            position_ids: Positions handed to the rotary embedding.
            past_key_value: Optional cache object; it must provide
                ``get_usable_length`` and ``update``.
            output_attentions: Return the attention probabilities when true.
            use_cache: Unused here; kept for interface compatibility.

        Returns:
            ``(attn_output, attn_weights, past_key_value)`` where
            ``attn_output`` has shape ``(batch, q_len, hidden_size)`` and
            ``attn_weights`` is ``None`` unless ``output_attentions`` is set.

        Raises:
            ValueError: On a cache without ``layer_idx`` or on unexpected
                score, mask or output shapes.
        """
        if not Phi3Attention._eager_notice_logged:
            logging.warning("You are not running the flash-attention implementation, expect numerical differences.")
            Phi3Attention._eager_notice_logged = True

        # ``mx.array.size`` is an element count, so unpack ``shape`` instead.
        bsz, q_len, _ = hidden_states.shape

        qkv = self.qkv_proj(hidden_states)
        query_pos = self.num_heads * self.head_dim
        kv_width = self.num_key_value_heads * self.head_dim
        query_states = qkv[..., :query_pos]
        key_states = qkv[..., query_pos : query_pos + kv_width]
        value_states = qkv[..., query_pos + kv_width :]

        # (batch, q_len, heads, head_dim) -> (batch, heads, q_len, head_dim).
        # MLX ``transpose`` takes the full axis permutation.
        query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)
        key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(0, 2, 1, 3)
        value_states = value_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(0, 2, 1, 3)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            if self.layer_idx is None:
                raise ValueError(
                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
                    "with a layer index."
                )
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        attn_weights = mx.matmul(query_states, key_states.transpose(0, 1, 3, 2)) / math.sqrt(self.head_dim)
        expected_scores = (bsz, self.num_heads, q_len, kv_seq_len)
        if attn_weights.shape != expected_scores:
            raise ValueError(
                f"Attention weights should be of size {expected_scores}, but is"
                f" {attn_weights.shape}"
            )

        if attention_mask is not None:
            expected_mask = (bsz, 1, q_len, kv_seq_len)
            if attention_mask.shape != expected_mask:
                raise ValueError(
                    f"Attention mask should be of size {expected_mask}, but is {attention_mask.shape}"
                )
            attn_weights = attn_weights + attention_mask

        # Softmax in float32 for stability, then back to the value dtype.
        attn_weights = mx.softmax(attn_weights.astype(mx.float32), axis=-1).astype(value_states.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        attn_output = mx.matmul(attn_weights, value_states)

        expected_output = (bsz, self.num_heads, q_len, self.head_dim)
        if attn_output.shape != expected_output:
            raise ValueError(
                f"`attn_output` should be of size {expected_output}, but is"
                f" {attn_output.shape}"
            )

        # (batch, heads, q_len, head_dim) -> (batch, q_len, hidden_size).
        attn_output = attn_output.transpose(0, 2, 1, 3)
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        if not output_attentions:
            attn_weights = None

        return attn_output, attn_weights, past_key_value
270
+
271
class Phi3SdpaAttention(Phi3Attention):
    """Attention variant backed by ``mx.fast.scaled_dot_product_attention``.

    Falls back to the eager parent implementation when attention weights are
    requested, or when attention dropout is active during training, because
    the fused kernel provides neither.
    """

    def __call__(
        self,
        hidden_states: mx.array,
        attention_mask: Optional[mx.array] = None,
        position_ids: Optional[LongTensor] = None,
        past_key_value: Optional[Tuple[mx.array, mx.array]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
    ) -> Tuple[mx.array, Optional[mx.array], Optional[Tuple[mx.array]]]:
        """Run fused self-attention over ``hidden_states``.

        Args:
            hidden_states: Input of shape ``(batch, q_len, hidden_size)``.
            attention_mask: Optional additive mask of shape
                ``(batch, 1, q_len, kv_seq_len)``. When omitted and
                ``q_len > 1`` a causal mask is built here.
            position_ids: Positions handed to the rotary embedding.
            past_key_value: Optional cache object; it must provide
                ``get_usable_length`` and ``update``.
            output_attentions: When true, delegates to the eager parent.
            use_cache: Only forwarded to the parent on fallback.

        Returns:
            ``(attn_output, None, past_key_value)`` on the fused path, with
            ``attn_output`` of shape ``(batch, q_len, hidden_size)``.

        Raises:
            ValueError: If ``attention_mask`` has an unexpected shape.
        """
        if output_attentions:
            logging.warning(
                "Phi3Model is using Phi3SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
            )
            return super().__call__(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        if self.training and self.attention_dropout > 0.0:
            # The fused MLX kernel has no dropout argument, so the eager path
            # is the only one that can honour ``attention_dropout``.
            return super().__call__(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
            )

        # ``mx.array.size`` is an element count, so unpack ``shape`` instead.
        bsz, q_len, _ = hidden_states.shape

        qkv = self.qkv_proj(hidden_states)
        query_pos = self.num_heads * self.head_dim
        kv_width = self.num_key_value_heads * self.head_dim
        query_states = qkv[..., :query_pos]
        key_states = qkv[..., query_pos : query_pos + kv_width]
        value_states = qkv[..., query_pos + kv_width :]

        # (batch, q_len, heads, head_dim) -> (batch, heads, q_len, head_dim).
        # MLX ``transpose`` takes the full axis permutation.
        query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)
        key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(0, 2, 1, 3)
        value_states = value_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(0, 2, 1, 3)

        kv_seq_len = key_states.shape[-2]
        if past_key_value is not None:
            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)

        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

        if past_key_value is not None:
            cache_kwargs = {"sin": sin, "cos": cos}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        key_states = repeat_kv(key_states, self.num_key_value_groups)
        value_states = repeat_kv(value_states, self.num_key_value_groups)

        if attention_mask is not None:
            expected_mask = (bsz, 1, q_len, kv_seq_len)
            if attention_mask.shape != expected_mask:
                raise ValueError(
                    f"Attention mask should be of size {expected_mask}, but is {attention_mask.shape}"
                )
            mask = attention_mask
            if mask.dtype != mx.bool_:
                # Keep an additive mask in the same dtype as the queries.
                mask = mask.astype(query_states.dtype)
        elif self.is_causal and q_len > 1:
            # Stand-in for torch's ``is_causal=True``. Query row i sits at
            # absolute position i + (kv_seq_len - q_len) and may attend to
            # every key at or before that position.
            query_positions = mx.arange(q_len)[:, None] + (kv_seq_len - q_len)
            key_positions = mx.arange(kv_seq_len)[None, :]
            mask = mx.where(key_positions > query_positions, float("-inf"), 0.0).astype(query_states.dtype)
        else:
            mask = None

        # The MLX kernel requires an explicit ``scale`` and names the mask
        # argument ``mask``; the CUDA contiguity workaround does not apply.
        attn_output = mx.fast.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            scale=self.head_dim ** -0.5,
            mask=mask,
        )

        # (batch, heads, q_len, head_dim) -> (batch, q_len, hidden_size).
        attn_output = attn_output.transpose(0, 2, 1, 3)
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

        attn_output = self.o_proj(attn_output)

        return attn_output, None, past_key_value
347
+
348
# Maps the configured attention implementation name to the class that
# Phi3DecoderLayer instantiates for it.
PHI3_ATTENTION_CLASSES = dict(
    eager=Phi3Attention,
    sdpa=Phi3SdpaAttention,
)
352
+
353
class Phi3DecoderLayer(nn.Module):
    """One pre-norm transformer block: self-attention followed by the gated MLP."""

    def __init__(self, config: Phi3VConfig, layer_idx: int):
        """Create the attention, MLP, norm and dropout sub-layers.

        Args:
            config: Model configuration; ``config._attn_implementation``
                selects the attention class from ``PHI3_ATTENTION_CLASSES``.
            layer_idx: Index of this layer, forwarded to the attention module.
        """
        super().__init__()
        self.config = config
        attention_cls = PHI3_ATTENTION_CLASSES[config._attn_implementation]
        self.self_attn = attention_cls(config, layer_idx=layer_idx)

        self.mlp = Phi3MLP(config)
        self.input_layernorm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
        self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
        self.post_attention_layernorm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

    def __call__(
        self,
        hidden_states: mx.array,
        attention_mask: Optional[mx.array] = None,
        position_ids: Optional[LongTensor] = None,
        past_key_value: Optional[Tuple[mx.array]] = None,
        output_attentions: bool = False,
        use_cache: bool = False,
        **kwargs,
    ) -> Tuple[mx.array, Optional[Tuple[FloatTensor, FloatTensor]]]:
        """Apply the block to ``hidden_states``.

        Args:
            hidden_states: Layer input.
            attention_mask: Mask forwarded to the attention module.
            position_ids: Positions forwarded to the attention module.
            past_key_value: Cache forwarded to the attention module.
            output_attentions: Append the attention weights to the result.
            use_cache: Append the key/value cache returned by attention.
            **kwargs: Only inspected for the deprecated ``padding_mask`` key.

        Returns:
            Tuple starting with the new hidden states, followed by the
            attention weights and/or the cache when requested, in that order.
        """
        if "padding_mask" in kwargs:
            warnings.warn(
                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
            )

        # Attention sub-block with residual connection.
        attn_input = self.input_layernorm(hidden_states)
        attn_outputs, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=attn_input,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
            use_cache=use_cache,
        )
        after_attention = hidden_states + self.resid_attn_dropout(attn_outputs)

        # Feed-forward sub-block with residual connection.
        mlp_output = self.mlp(self.post_attention_layernorm(after_attention))
        block_output = after_attention + self.resid_mlp_dropout(mlp_output)

        collected = [block_output]
        if output_attentions:
            collected.append(self_attn_weights)
        if use_cache:
            collected.append(present_key_value)
        return tuple(collected)
409
+
410
class Phi3VPreTrainedModel(nn.Module):
    """Common base for the Phi-3-V models: stores the config and weight init."""

    config_class = Phi3VConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["Phi3DecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = False
    _supports_sdpa = True
    _supports_cache_class = True
    _version = "0.0.5"

    def __init__(self, config):
        """Keep a reference to the model configuration."""
        super(Phi3VPreTrainedModel, self).__init__()
        self.config = config

    def _init_weights(self, module):
        """Re-initialise ``module`` with ``N(0, initializer_range)`` weights.

        Only ``nn.Linear`` and ``nn.Embedding`` are touched. Linear biases,
        when present, are zeroed.

        Args:
            module: A sub-module of this model.
        """
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            # MLX parameters are plain arrays: assign a fresh one instead of
            # mutating ``.data`` in place as torch does.
            module.weight = std * mx.random.normal(module.weight.shape, dtype=module.weight.dtype)
            # A bias-free ``nn.Linear`` has no ``bias`` entry at all.
            if "bias" in module:
                module.bias = mx.zeros_like(module.bias)
        elif isinstance(module, nn.Embedding):
            weight = std * mx.random.normal(module.weight.shape, dtype=module.weight.dtype)
            # MLX embeddings carry no ``padding_idx``; honour one only if a
            # subclass defines it.
            padding_idx = getattr(module, "padding_idx", None)
            if padding_idx is not None:
                weight[padding_idx] = 0.0
            module.weight = weight


class Phi3VModel(Phi3VPreTrainedModel):
    """Phi-3-V decoder stack with optional image-embedding front end."""

    def __init__(self, config: Phi3VConfig):
        """Build embeddings, decoder layers and the final norm from ``config``."""
        super(Phi3VModel, self).__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
        self.embed_dropout = nn.Dropout(config.embd_pdrop)

        # Vision embedding integration
        if isinstance(config.embd_layer, dict) and config.embd_layer.get('embedding_cls') == 'image':
            self.vision_embed_tokens = Phi3ImageEmbedding(config)
        else:
            self.vision_embed_tokens = None

        self.layers = [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
        self.norm = nn.RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        self.gradient_checkpointing = False
        # ``nn.Module.apply`` in MLX maps over parameter arrays, not modules,
        # so the per-module initialiser goes through ``apply_to_modules``.
        self.apply_to_modules(lambda _name, module: self._init_weights(module))

    def get_input_embeddings(self):
        """Return the token embedding layer."""
        return self.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the token embedding layer."""
        self.embed_tokens = value

    def __call__(
        self,
        input_ids: LongTensor = None,
        attention_mask: Optional[mx.array] = None,
        position_ids: Optional[LongTensor] = None,
        past_key_values: Optional[List[FloatTensor]] = None,
        inputs_embeds: Optional[FloatTensor] = None,
        pixel_values: Optional[FloatTensor] = None,
        image_sizes: Optional[LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Run the decoder stack.

        Exactly one of ``input_ids`` / ``inputs_embeds`` must be given. When
        ``pixel_values`` and ``image_sizes`` are both supplied together with
        ``input_ids``, the vision embedding layer produces the input
        embeddings; otherwise the token embedding table does.

        Args:
            input_ids: Token ids, indexed ``(batch, sequence)``.
            attention_mask: Optional 2-D padding mask.
            position_ids: Optional positions; generated from the cache length
                when omitted.
            past_key_values: Cache object or legacy cache.
            inputs_embeds: Pre-computed input embeddings.
            pixel_values: Image input for the vision embedding layer.
            image_sizes: Image sizes for the vision embedding layer.
            use_cache: Return the key/value cache.
            output_attentions: Return per-layer attention weights.
            output_hidden_states: Return per-layer hidden states.
            return_dict: Return a ``BaseModelOutputWithPast`` instead of a tuple.

        Returns:
            ``BaseModelOutputWithPast`` or, when ``return_dict`` is false, a
            tuple of the non-``None`` entries among last hidden state, cache,
            hidden states and attentions.

        Raises:
            ValueError: On missing/duplicate inputs or right-padded batches on
                the flash-attention path.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # The model object itself never defines ``_attn_implementation``; the
        # setting lives on the config (Phi3DecoderLayer reads it there too).
        attn_implementation = self.config._attn_implementation

        # retrieve input_ids and inputs_embeds
        if input_ids is not None and inputs_embeds is not None:
            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
        elif input_ids is not None:
            batch_size, seq_length = input_ids.shape[:2]
        elif inputs_embeds is not None:
            batch_size, seq_length = inputs_embeds.shape[:2]
        else:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        past_key_values_length = 0

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logging.warning(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        if use_cache:
            use_legacy_cache = not isinstance(past_key_values, Cache)
            if use_legacy_cache:
                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
            past_key_values_length = past_key_values.get_usable_length(seq_length)

        if position_ids is None:
            # MLX arrays have no device attribute and no ``long`` dtype.
            position_ids = mx.arange(
                past_key_values_length, seq_length + past_key_values_length, dtype=mx.int64
            )
            position_ids = position_ids.reshape(-1, seq_length)
        else:
            position_ids = position_ids.reshape(-1, seq_length).astype(mx.int64)

        if inputs_embeds is None:
            if pixel_values is not None and image_sizes is not None:
                assert self.vision_embed_tokens is not None, "Vision embedding layer is not defined"
                inputs_embeds = self.vision_embed_tokens(input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
            else:
                inputs_embeds = self.embed_tokens(input_ids)

        if attention_mask is not None and attn_implementation == "flash_attention_2" and use_cache:
            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
            if is_padding_right:
                raise ValueError(
                    "You are attempting to perform batched generation with padding_side='right'"
                    " this may lead to unexpected behaviour for Flash Attention version of Phi3. Make sure to "
                    " call `tokenizer.padding_side = 'left'` before tokenizing the input. "
                )

        if attn_implementation == "flash_attention_2":
            # 2d mask is passed through the layers; dropped entirely when it
            # masks nothing. ``0 in array`` is not usable on mx arrays.
            has_padding = attention_mask is not None and mx.any(attention_mask == 0).item()
            attention_mask = attention_mask if has_padding else None
        else:
            # 4d mask is passed through the layers
            attention_mask = _prepare_4d_causal_attention_mask(
                attention_mask,
                (batch_size, seq_length),
                inputs_embeds,
                past_key_values_length,
                sliding_window=self.config.sliding_window,
            )

        hidden_states = inputs_embeds

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attns = () if output_attentions else None
        next_decoder_cache = None

        for decoder_layer in self.layers:
            if output_hidden_states:
                all_hidden_states += (hidden_states,)

            if self.gradient_checkpointing and self.training:
                # NOTE(review): ``_gradient_checkpointing_func`` is not defined
                # in the visible code; this branch is only reachable if a
                # caller flips ``gradient_checkpointing`` on -- confirm.
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    position_ids=position_ids,
                    past_key_value=past_key_values,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )

            hidden_states = layer_outputs[0]

            if use_cache:
                # The cache follows the attention weights when those are returned.
                next_decoder_cache = layer_outputs[2 if output_attentions else 1]

            if output_attentions:
                all_self_attns += (layer_outputs[1],)

        hidden_states = self.norm(hidden_states)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        next_cache = None
        if use_cache:
            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
        if not return_dict:
            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attns,
        )

    @staticmethod
    def from_pretrained(path_or_hf_repo: str):
        """Build a ``Phi3VModel`` and load its safetensors weights.

        Args:
            path_or_hf_repo: Local directory, or a Hugging Face repo id that
                is downloaded when no such local path exists.

        Returns:
            The model with all ``*.safetensors`` files from the directory loaded.

        Raises:
            AssertionError: If the directory contains no safetensors files.
        """
        path = Path(path_or_hf_repo)
        if not path.exists():
            path = Path(
                snapshot_download(
                    repo_id=path_or_hf_repo,
                    allow_patterns=[
                        "*.json",
                        "*.safetensors",
                        "*.py",
                        "tokenizer.model",
                        "*.tiktoken",
                    ],
                )
            )

        with open(path / "config.json", "r") as f:
            model_config = json.load(f)

        model = Phi3VModel(Phi3VConfig.from_dict(model_config))

        weight_files = list(glob.glob(f"{path}/*.safetensors"))
        assert len(weight_files) > 0, f"No safetensors weight files found in {path}"

        # Merge the shards. The loaded values stay ``mx.array`` objects, which
        # is what ``load_weights`` expects; converting them to Python lists
        # (as the previous code did) makes loading fail.
        weights = {}
        for wf in weight_files:
            weights.update(mx.load(wf))

        model.load_weights(list(weights.items()))
        return model
643
+
644
+
645
class Phi3VForCausalLM(Phi3VPreTrainedModel):
    """Phi-3-V decoder with a language-modelling head on top."""

    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        """Create the decoder and the bias-free output projection."""
        super().__init__(config)
        self.model = Phi3VModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        # No ``post_init()`` call: neither Phi3VPreTrainedModel nor
        # ``mlx.nn.Module`` defines one, so calling it raised AttributeError.
        # ``lm_head`` keeps ``nn.Linear``'s own initialisation.

    def get_input_embeddings(self):
        """Return the decoder's token embedding layer."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the decoder's token embedding layer."""
        self.model.embed_tokens = value

    def get_output_embeddings(self):
        """Return the output projection."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the output projection."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder."""
        self.model = decoder

    def get_decoder(self):
        """Return the wrapped decoder."""
        return self.model

    def __call__(
        self,
        input_ids: LongTensor = None,
        attention_mask: Optional[mx.array] = None,
        position_ids: Optional[LongTensor] = None,
        past_key_values: Optional[List[FloatTensor]] = None,
        inputs_embeds: Optional[FloatTensor] = None,
        pixel_values: Optional[FloatTensor] = None,
        image_sizes: Optional[LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the decoder and project its output to vocabulary logits.

        All arguments are forwarded to the wrapped ``Phi3VModel``. No loss is
        computed; ``loss`` is always ``None``.

        Returns:
            ``CausalLMOutputWithPast`` with float32 ``logits``, or, when
            ``return_dict`` is false, ``(logits, *decoder_outputs[1:])``.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            pixel_values=pixel_values,
            image_sizes=image_sizes,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        # ``mx.array`` has no ``float()``; cast explicitly.
        logits = logits.astype(mx.float32)

        loss = None

        if not return_dict:
            output = (logits,) + outputs[1:]
            return (loss,) + output if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

    def prepare_inputs_for_generation(
        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, pixel_values=None, image_sizes=None, **kwargs
    ):
        """Assemble the keyword arguments for one generation step.

        Trims ``input_ids`` to the tokens not yet covered by the cache, crops
        ``attention_mask`` to the cache's maximum length when it has one, and
        derives ``position_ids`` from ``attention_mask`` when none are passed.

        Returns:
            Dict with either ``inputs_embeds`` (first step only) or
            ``input_ids``, plus ``position_ids``, ``past_key_values``,
            ``use_cache``, ``attention_mask``, ``pixel_values`` and
            ``image_sizes``.
        """
        if past_key_values is not None:
            if isinstance(past_key_values, Cache):
                cache_length = past_key_values.get_seq_length()
                past_length = past_key_values.seen_tokens
                max_cache_length = past_key_values.get_max_length()
            else:
                cache_length = past_length = past_key_values[0][0].shape[2]
                max_cache_length = None

            # Keep only the unprocessed tokens:
            # 1 - the mask is longer than the ids, so some inputs were passed
            #     exclusively through the cache;
            # 2 - otherwise drop everything the cache has already seen.
            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
            elif past_length < input_ids.shape[1]:
                input_ids = input_ids[:, past_length:]

            if (
                max_cache_length is not None
                and attention_mask is not None
                and cache_length + input_ids.shape[1] > max_cache_length
            ):
                attention_mask = attention_mask[:, -max_cache_length:]

        position_ids = kwargs.get("position_ids", None)
        if attention_mask is not None and position_ids is None:
            # Running count of attended tokens gives each token its position.
            position_ids = mx.cumsum(attention_mask.astype(mx.int64), axis=-1) - 1
            # Padded slots get the dummy position 1 (functional replacement
            # for torch's in-place ``masked_fill_``).
            position_ids = mx.where(attention_mask == 0, 1, position_ids)
            if past_key_values:
                position_ids = position_ids[:, -input_ids.shape[1] :]

        if inputs_embeds is not None and past_key_values is None:
            model_inputs = {"inputs_embeds": inputs_embeds}
        else:
            model_inputs = {"input_ids": input_ids}

        model_inputs.update(
            {
                "position_ids": position_ids,
                "past_key_values": past_key_values,
                "use_cache": kwargs.get("use_cache"),
                "attention_mask": attention_mask,
                "pixel_values": pixel_values,
                "image_sizes": image_sizes,
            }
        )
        return model_inputs

    @staticmethod
    def _reorder_cache(past_key_values, beam_idx):
        """Reorder a legacy cache along the batch axis to follow ``beam_idx``.

        Args:
            past_key_values: Iterable of per-layer tuples of cached arrays.
            beam_idx: Indices selecting the batch rows to keep, in order.

        Returns:
            Tuple with the same nesting whose arrays are gathered on axis 0.
        """
        # ``mx.take`` replaces torch's ``index_select``; MLX arrays have no
        # device to move ``beam_idx`` to.
        return tuple(
            tuple(mx.take(past_state, beam_idx, axis=0) for past_state in layer_past)
            for layer_past in past_key_values
        )
preprocessor_config.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoProcessor": "processing_phi3_v.Phi3VProcessor",
4
+ "AutoImageProcessor": "image_processing_phi3_v.Phi3VImageProcessor"
5
+ },
6
+ "num_crops": 16,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_processor_type": "Phi3VImageProcessor",
13
+ "image_std": [
14
+ 0.26862954,
15
+ 0.26130258,
16
+ 0.27577711
17
+ ],
18
+ "processor_class": "Phi3VProcessor",
19
+ "num_img_tokens": 144
20
+ }
processing_phi3_v.py ADDED
@@ -0,0 +1,217 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # coding=utf-8
2
+ # Copyright 2024 Microsoft and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ """
17
+ Processor class for Phi3-V.
18
+ """
19
+ import re
20
+ from typing import List, Optional, Union
21
+
22
+ import torch
23
+
24
+ import transformers
25
+ from transformers.feature_extraction_utils import BatchFeature
26
+ from transformers.image_utils import ImageInput
27
+ from transformers.processing_utils import ProcessorMixin
28
+ from transformers.tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
29
+ from transformers.utils import TensorType
30
+ from .image_processing_phi3_v import Phi3VImageProcessor
31
# Expose the image processor on the ``transformers`` namespace under the name
# that ``Phi3VProcessor.image_processor_class`` refers to.
setattr(transformers, "Phi3VImageProcessor", Phi3VImageProcessor)
32
+
33
class Phi3VProcessor(ProcessorMixin):
    r"""
    Constructs a Phi3-V processor which wraps a Phi3-V image processor and a LLaMa tokenizer into a single processor.

    [`Phi3VProcessor`] offers all the functionalities of [`Phi3VImageProcessor`] and [`LlamaTokenizerFast`]. See the
    [`~Phi3VProcessor.__call__`] and [`~Phi3VProcessor.decode`] for more information.

    Args:
        image_processor ([`Phi3VImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`LlamaTokenizerFast`], *optional*):
            The tokenizer is a required input.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "Phi3VImageProcessor"
    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
    special_image_token = "<|image|>"

    def __init__(self, image_processor, tokenizer):
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.num_img_tokens = image_processor.num_img_tokens
        # NOTE(review): this eagerly builds one million placeholder strings. It is kept
        # unchanged because `img_tokens` is a public attribute that callers may read.
        self.img_tokens = [f"<|image_{i+1}|>" for i in range(1000000)]

    def __call__(
        self,
        text: Union[TextInput, List[TextInput]],
        images: ImageInput = None,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = None,
        max_length=None,
        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
    ) -> BatchFeature:
        """
        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        Phi3VImageProcessor's [`~Phi3VImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        When `images` is provided, `text` must be a single string containing `<|image_N|>` tags; in that case
        `padding`, `truncation` and `max_length` are not applied and `input_ids` / `attention_mask` are always
        returned as PyTorch tensors with a leading batch dimension of 1.

        Args:
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
                Select a strategy to pad the returned sequences (according to the model's padding side and padding
                index) among:
                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
                  sequence if provided).
                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
                  acceptable input length for the model if that argument is not provided.
                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
                  lengths).
            max_length (`int`, *optional*):
                Maximum length of the returned list and optionally padding length (see above).
            truncation (`bool`, *optional*):
                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:

                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
            - **image_sizes** -- Image sizes reported by the image processor. Returned when `images` is not `None`.
        """
        if images is not None:
            image_inputs = self.image_processor(images, return_tensors=return_tensors)
        else:
            image_inputs = {}
        inputs = self._convert_images_texts_to_inputs(
            image_inputs,
            text,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
            return_tensors=return_tensors,
        )
        return inputs

    def calc_num_image_tokens(self, images: ImageInput):
        """ Calculate the number of image tokens for each image.
        Args:
            images (`ImageInput`):
                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
        """
        return self.image_processor.calc_num_image_tokens(images)

    def calc_num_image_tokens_from_image_size(self, width, height):
        """ Calculate the number of image token for an image with given width and height.
        Args:
            width (`int`):
                Width of the image.
            height (`int`):
                Height of the image.
        """
        return self.image_processor.calc_num_image_tokens_from_image_size(width, height)

    @property
    def special_image_token_id(self):
        """Token id the tokenizer assigns to `special_image_token`."""
        return self.tokenizer.convert_tokens_to_ids(self.special_image_token)

    def get_special_image_token_id(self):
        """Method form of `special_image_token_id`, kept for callers that use the getter."""
        return self.special_image_token_id

    def _convert_images_texts_to_inputs(self, images, texts, padding=False, truncation=None, max_length=None, return_tensors=None):
        """
        Build model inputs from image-processor output and a prompt.

        Without images the prompt is simply tokenized. With images, the prompt is split on its
        `<|image_N|>` tags, every text chunk is tokenized separately, and each tag is replaced by a run of
        the negative id `-N` whose length is the number of image tokens of image `N`.

        Args:
            images: Mapping returned by the image processor (empty when there are no images). Must contain
                `pixel_values` and `image_sizes`, plus either `num_img_tokens` or `num_crops`.
            texts: The prompt. Must be a single `str` when `images` is non-empty.
            padding, truncation, max_length, return_tensors: Forwarded to the tokenizer in the text-only
                path; unused when images are present.

        Returns:
            [`BatchFeature`] with the tokenizer output (text-only) or with `input_ids`, `attention_mask`,
            `pixel_values` and `image_sizes`.

        Raises:
            TypeError: If images are present and `texts` is not a string.
            AssertionError: If the image tags are not numbered 1..n without gaps, or their count does not
                match the number of images.
        """
        if not len(images):
            model_inputs = self.tokenizer(texts, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length)
            return BatchFeature(data={**model_inputs})

        # `re.split` below only accepts a string; fail early with an actionable message instead of the
        # opaque "expected string or bytes-like object" error (same exception type as before).
        if not isinstance(texts, str):
            raise TypeError(
                f"`text` must be a single string when images are provided, got {type(texts).__name__}"
            )

        pattern = r"<\|image_\d+\|>"
        # Each chunk is tokenized on its own, exactly as before, so any tokens the tokenizer adds per
        # call are still added once per chunk.
        prompt_chunks = [self.tokenizer(chunk).input_ids for chunk in re.split(pattern, texts)]

        if 'num_img_tokens' in images:
            num_img_tokens = images['num_img_tokens']
        else:
            assert 'num_crops' in images, 'num_crops must be provided in images if num_img_tokens is not provided'
            num_crops = images['num_crops']
            num_img_tokens = [_num_crops * self.num_img_tokens for _num_crops in num_crops]

        images, image_sizes = images['pixel_values'], images['image_sizes']

        # image_tags needs to start from 1 to n
        image_tags = re.findall(pattern, texts)
        image_ids = [int(s.split("|")[1].split("_")[-1]) for s in image_tags]
        unique_image_ids = sorted(set(image_ids))
        # image_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be [1, 4, 5]
        assert unique_image_ids == list(range(1, len(unique_image_ids)+1)), f"image_ids must start from 1, and must be continuous int, e.g. [1, 2, 3], cannot be {unique_image_ids}"
        # total images must be the same as the number of image tags
        assert len(unique_image_ids) == len(images), f"total images must be the same as the number of image tags, got {len(unique_image_ids)} image tags and {len(images)} images"

        # One run of `-iid` placeholders per tag occurrence, sized by that image's token count.
        image_ids_pad = [[-iid]*num_img_tokens[iid-1] for iid in image_ids]

        # Splitting on n tags always yields n + 1 chunks, so interleave as
        # chunk0, pad0, chunk1, pad1, ..., chunk_n.
        input_ids = list(prompt_chunks[0])
        for image_pad, chunk in zip(image_ids_pad, prompt_chunks[1:]):
            input_ids.extend(image_pad)
            input_ids.extend(chunk)

        input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0)
        # Every position is attended to: image placeholders are small negative ids, far above the bound.
        attention_mask = (input_ids > -1000000).to(torch.long)

        return BatchFeature(data={"input_ids": input_ids,
                                  "attention_mask": attention_mask,
                                  "pixel_values": images,
                                  "image_sizes": image_sizes})

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names, de-duplicated in order."""
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
sample_inference.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ from PIL import Image
4
+ import requests
5
+ import torch
6
+ from transformers import AutoModelForCausalLM
7
+ from transformers import AutoProcessor
8
model_path = "./"

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True, torch_dtype="auto").cuda()

user_prompt = '<|user|>\n'
assistant_prompt = '<|assistant|>\n'
prompt_suffix = "<|end|>\n"


def _load_image(url):
    """Download `url` and open it as a PIL image, failing loudly on HTTP errors."""
    # A timeout keeps the sample from hanging forever; raise_for_status surfaces a bad
    # response as an HTTP error instead of a confusing image-decoding failure.
    http_response = requests.get(url, stream=True, timeout=60)
    http_response.raise_for_status()
    return Image.open(http_response.raw)


def _run_example(prompt, images=None, skip_special_tokens=True):
    """Print the prompt, generate up to 1000 new tokens, then print and return the decoded reply."""
    print(f">>> Prompt\n{prompt}")
    inputs = processor(prompt, images, return_tensors="pt").to("cuda:0")
    generate_ids = model.generate(**inputs,
                                  max_new_tokens=1000,
                                  eos_token_id=processor.tokenizer.eos_token_id,
                                  )
    # Drop the prompt tokens so only the newly generated continuation is decoded.
    generate_ids = generate_ids[:, inputs['input_ids'].shape[1]:]
    response = processor.batch_decode(generate_ids,
                                      skip_special_tokens=skip_special_tokens,
                                      clean_up_tokenization_spaces=False)[0]
    print(f'>>> Response\n{response}')
    return response


#################################################### text-only ####################################################
# text-only prompt (no image)
prompt = f"{user_prompt}what is the answer for 1+1? Explain it.{prompt_suffix}{assistant_prompt}"
_run_example(prompt)

#################################################### text-only 2 ####################################################
# text-only prompt (no image)
prompt = f"{user_prompt}Give me the code for sloving two-sum problem.{prompt_suffix}{assistant_prompt}"
_run_example(prompt)


#################################################### EXAMPLE 1 ####################################################
# single-image prompt
prompt = f"{user_prompt}<|image_1|>\nWhat is shown in this image?{prompt_suffix}{assistant_prompt}"
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = _load_image(url)
_run_example(prompt, image)

#################################################### EXAMPLE 2 ####################################################
# multiple image prompt
# Note: image tokens must start from <|image_1|>
prompt = f"{user_prompt}<|image_1|>\n<|image_2|>\n What is shown in this two images?{prompt_suffix}{assistant_prompt}"
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image_1 = _load_image(url)
url = "https://img.freepik.com/free-photo/painting-mountain-lake-with-mountain-background_188544-9126.jpg?w=2000"
image_2 = _load_image(url)
images = [image_1, image_2]
_run_example(prompt, images)

#################################################### EXAMPLE 3 ####################################################
# chat template
chat = [
    {"role": "user", "content": "<|image_1|>\nWhat is shown in this image?"},
    {"role": "assistant", "content": "The image depicts a street scene with a prominent red stop sign in the foreground. The background showcases a building with traditional Chinese architecture, characterized by its red roof and ornate decorations. There are also several statues of lions, which are common in Chinese culture, positioned in front of the building. The street is lined with various shops and businesses, and there's a car passing by."},
    {"role": "user", "content": "What is so special about this image"}
]
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
image = _load_image(url)
prompt = processor.tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
# need to remove last <|endoftext|> if it is there, which is used for training, not inference. For training, make sure to add <|endoftext|> in the end.
# Slice the suffix off: str.rstrip() would treat the argument as a set of characters and could
# also eat trailing prompt text made of those characters.
end_of_text = "<|endoftext|>"
if prompt.endswith(end_of_text):
    prompt = prompt[:-len(end_of_text)]

_run_example(prompt, [image])


############################# to markdown #############################
# single-image prompt
prompt = f"{user_prompt}<|image_1|>\nCan you convert the table to markdown format?{prompt_suffix}{assistant_prompt}"
url = "https://support.content.office.net/en-us/media/3dd2b79b-9160-403d-9967-af893d17b580.png"
image = _load_image(url)
# Special tokens are kept in this example's decoded output.
_run_example(prompt, image, skip_special_tokens=False)
special_tokens_map.json ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|system|>",
4
+ "<|end|>",
5
+ "<|user|>",
6
+ "<|end|>"
7
+ ],
8
+ "bos_token": {
9
+ "content": "<s>",
10
+ "lstrip": false,
11
+ "normalized": false,
12
+ "rstrip": false,
13
+ "single_word": false
14
+ },
15
+ "eos_token": {
16
+ "content": "<|endoftext|>",
17
+ "lstrip": false,
18
+ "normalized": false,
19
+ "rstrip": false,
20
+ "single_word": false
21
+ },
22
+ "pad_token": {
23
+ "content": "<|endoftext|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false
28
+ },
29
+ "unk_token": {
30
+ "content": "<unk>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false
35
+ }
36
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,408 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "added_tokens_decoder": {
5
+ "0": {
6
+ "content": "<unk>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "1": {
14
+ "content": "<s>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "2": {
22
+ "content": "</s>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": true,
26
+ "single_word": false,
27
+ "special": false
28
+ },
29
+ "32000": {
30
+ "content": "<|endoftext|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "32001": {
38
+ "content": "<|assistant|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": true,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "32002": {
46
+ "content": "<|placeholder1|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": true,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "32003": {
54
+ "content": "<|placeholder2|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": true,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "32004": {
62
+ "content": "<|placeholder3|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": true,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "32005": {
70
+ "content": "<|placeholder4|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": true,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "32006": {
78
+ "content": "<|system|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "32007": {
86
+ "content": "<|end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "32008": {
94
+ "content": "<|placeholder5|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": true,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "32009": {
102
+ "content": "<|placeholder6|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": true,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "32010": {
110
+ "content": "<|user|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "32011": {
118
+ "content": "<|placeholder7|>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": true,
122
+ "single_word": false,
123
+ "special": true
124
+ },
125
+ "32012": {
126
+ "content": "<|placeholder8|>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": true,
130
+ "single_word": false,
131
+ "special": true
132
+ },
133
+ "32013": {
134
+ "content": "<|placeholder9|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": true,
138
+ "single_word": false,
139
+ "special": true
140
+ },
141
+ "32014": {
142
+ "content": "<|placeholder10|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": true,
146
+ "single_word": false,
147
+ "special": true
148
+ },
149
+ "32015": {
150
+ "content": "<|placeholder11|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": true,
154
+ "single_word": false,
155
+ "special": true
156
+ },
157
+ "32016": {
158
+ "content": "<|placeholder12|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": true,
162
+ "single_word": false,
163
+ "special": true
164
+ },
165
+ "32017": {
166
+ "content": "<|placeholder13|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": true,
170
+ "single_word": false,
171
+ "special": true
172
+ },
173
+ "32018": {
174
+ "content": "<|placeholder14|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": true,
178
+ "single_word": false,
179
+ "special": true
180
+ },
181
+ "32019": {
182
+ "content": "<|placeholder15|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": true,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "32020": {
190
+ "content": "<|placeholder16|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": true,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "32021": {
198
+ "content": "<|placeholder17|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": true,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "32022": {
206
+ "content": "<|placeholder18|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": true,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "32023": {
214
+ "content": "<|placeholder19|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": true,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "32024": {
222
+ "content": "<|placeholder20|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": true,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "32025": {
230
+ "content": "<|placeholder21|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": true,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "32026": {
238
+ "content": "<|placeholder22|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": true,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "32027": {
246
+ "content": "<|placeholder23|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": true,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "32028": {
254
+ "content": "<|placeholder24|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": true,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "32029": {
262
+ "content": "<|placeholder25|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": true,
266
+ "single_word": false,
267
+ "special": true
268
+ },
269
+ "32030": {
270
+ "content": "<|placeholder26|>",
271
+ "lstrip": false,
272
+ "normalized": false,
273
+ "rstrip": true,
274
+ "single_word": false,
275
+ "special": true
276
+ },
277
+ "32031": {
278
+ "content": "<|placeholder27|>",
279
+ "lstrip": false,
280
+ "normalized": false,
281
+ "rstrip": true,
282
+ "single_word": false,
283
+ "special": true
284
+ },
285
+ "32032": {
286
+ "content": "<|placeholder28|>",
287
+ "lstrip": false,
288
+ "normalized": false,
289
+ "rstrip": true,
290
+ "single_word": false,
291
+ "special": true
292
+ },
293
+ "32033": {
294
+ "content": "<|placeholder29|>",
295
+ "lstrip": false,
296
+ "normalized": false,
297
+ "rstrip": true,
298
+ "single_word": false,
299
+ "special": true
300
+ },
301
+ "32034": {
302
+ "content": "<|placeholder30|>",
303
+ "lstrip": false,
304
+ "normalized": false,
305
+ "rstrip": true,
306
+ "single_word": false,
307
+ "special": true
308
+ },
309
+ "32035": {
310
+ "content": "<|placeholder31|>",
311
+ "lstrip": false,
312
+ "normalized": false,
313
+ "rstrip": true,
314
+ "single_word": false,
315
+ "special": true
316
+ },
317
+ "32036": {
318
+ "content": "<|placeholder32|>",
319
+ "lstrip": false,
320
+ "normalized": false,
321
+ "rstrip": true,
322
+ "single_word": false,
323
+ "special": true
324
+ },
325
+ "32037": {
326
+ "content": "<|placeholder33|>",
327
+ "lstrip": false,
328
+ "normalized": false,
329
+ "rstrip": true,
330
+ "single_word": false,
331
+ "special": true
332
+ },
333
+ "32038": {
334
+ "content": "<|placeholder34|>",
335
+ "lstrip": false,
336
+ "normalized": false,
337
+ "rstrip": true,
338
+ "single_word": false,
339
+ "special": true
340
+ },
341
+ "32039": {
342
+ "content": "<|placeholder35|>",
343
+ "lstrip": false,
344
+ "normalized": false,
345
+ "rstrip": true,
346
+ "single_word": false,
347
+ "special": true
348
+ },
349
+ "32040": {
350
+ "content": "<|placeholder36|>",
351
+ "lstrip": false,
352
+ "normalized": false,
353
+ "rstrip": true,
354
+ "single_word": false,
355
+ "special": true
356
+ },
357
+ "32041": {
358
+ "content": "<|placeholder37|>",
359
+ "lstrip": false,
360
+ "normalized": false,
361
+ "rstrip": true,
362
+ "single_word": false,
363
+ "special": true
364
+ },
365
+ "32042": {
366
+ "content": "<|placeholder38|>",
367
+ "lstrip": false,
368
+ "normalized": false,
369
+ "rstrip": true,
370
+ "single_word": false,
371
+ "special": true
372
+ },
373
+ "32043": {
374
+ "content": "<|placeholder39|>",
375
+ "lstrip": false,
376
+ "normalized": false,
377
+ "rstrip": true,
378
+ "single_word": false,
379
+ "special": true
380
+ },
381
+ "32044": {
382
+ "content": "<|image|>",
383
+ "lstrip": false,
384
+ "normalized": false,
385
+ "rstrip": true,
386
+ "single_word": false,
387
+ "special": true
388
+ }
389
+ },
390
+ "additional_special_tokens": [
391
+ "<|system|>",
392
+ "<|end|>",
393
+ "<|user|>",
394
+ "<|end|>"
395
+ ],
396
+ "bos_token": "<s>",
397
+ "chat_template": "{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
398
+ "clean_up_tokenization_spaces": false,
399
+ "eos_token": "<|endoftext|>",
400
+ "legacy": false,
401
+ "model_max_length": 131072,
402
+ "pad_token": "<|endoftext|>",
403
+ "padding_side": "right",
404
+ "sp_model_kwargs": {},
405
+ "tokenizer_class": "LlamaTokenizer",
406
+ "unk_token": "<unk>",
407
+ "use_default_system_prompt": false
408
+ }
utils.py ADDED
@@ -0,0 +1,255 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass, field
2
+ import inspect
3
+ import logging
4
+ from typing import Optional, List, Union, Dict, Tuple, Any
5
+ from transformers.configuration_utils import PretrainedConfig
6
+ import mlx.core as mx
7
+
8
+
9
class FloatTensor:
    """Holder for an optional `mx.array` that is always created with dtype `mx.float32`."""

    def __init__(self, data):
        # `None` is stored untouched; any other value is converted to a float32 array.
        self.tensor = None if data is None else mx.array(data, dtype=mx.float32)

    def __repr__(self):
        # Show the wrapped value rather than the wrapper.
        wrapped = self.tensor
        return repr(wrapped)
19
+
20
class LongTensor:
    """Holder for an optional `mx.array` that is always created with dtype `mx.int64`."""

    def __init__(self, data=None):
        # With no data (the default) the wrapper starts out empty.
        self.tensor = None if data is None else mx.array(data, dtype=mx.int64)

    def assign(self, data):
        """Replace the wrapped value with a new int64 array built from `data`."""
        converted = mx.array(data, dtype=mx.int64)
        self.tensor = converted

    def __repr__(self):
        # Show the wrapped value rather than the wrapper.
        wrapped = self.tensor
        return repr(wrapped)
33
+
34
@dataclass
class BaseModelOutputWithPast:
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    All fields default to `None`.

    Args:
        last_hidden_state (`FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    # NOTE(review): shapes above are carried over from the existing docstring; nothing in this
    # class enforces them — confirm against the model code that fills these fields.
    last_hidden_state: Optional[FloatTensor] = None
    past_key_values: Optional[Tuple[Tuple[FloatTensor]]] = None
    hidden_states: Optional[Tuple[FloatTensor, ...]] = None
    attentions: Optional[Tuple[FloatTensor, ...]] = None
71
+
72
+
73
@dataclass
class Cache:
    """
    Abstract base for key/value caches; how the states are actually stored is up to each subclass.
    """

    def update(
        self,
        key_states: mx.array,
        value_states: mx.array,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[mx.array, mx.array]:
        """
        Store the new `key_states` and `value_states` for layer `layer_idx`.

        Parameters:
            key_states (`mx.array`):
                Key states to add to the cache.
            value_states (`mx.array`):
                Value states to add to the cache.
            layer_idx (`int`):
                Index of the layer the states belong to.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Extra, subclass-specific arguments.

        Return:
            A tuple with the updated key and value states.

        This base implementation always raises `NotImplementedError`.
        """
        raise NotImplementedError("Make sure to implement `update` in a subclass.")

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Sequence length of the cached states, optionally for a given layer. Must be overridden."""
        # TODO: deprecate this function in favor of `cache_position`
        raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.")

    def get_max_length(self) -> Optional[int]:
        """Maximum sequence length the cache can hold, or `None` if unbounded. Must be overridden."""
        raise NotImplementedError("Make sure to implement `get_max_length` in a subclass.")

    def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
        """Return how much of the cache is usable once `new_seq_length` new positions are added."""
        limit = self.get_max_length()
        cached_length = self.get_seq_length(layer_idx)
        # Unbounded cache: everything already cached stays usable.
        if limit is None:
            return cached_length
        # Bounded cache with enough room for the new inputs: likewise fully usable.
        if cached_length + new_seq_length <= limit:
            return cached_length
        # Otherwise part of the cache has to be evicted to make room for the new inputs.
        return limit - new_seq_length

    # def reorder_cache(self, beam_idx: LongTensor):
    #     """Reorders the cache for beam search, given the selected beam indices."""
    #     for layer_idx in range(len(self.key_cache)):
    #         device = self.key_cache[layer_idx].device
    #         self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
    #         device = self.value_cache[layer_idx].device
    #         self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
    @property
    def seen_tokens(self):
        """Deprecated accessor: logs a warning, then returns `_seen_tokens` if set, else `None`."""
        logging.warning(
            "The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` "
            "model input instead."
        )
        return getattr(self, "_seen_tokens", None)
142
+
143
+
144
class DynamicCache(Cache):
    """
    A cache that grows dynamically as more tokens are generated. This is the default for generative models.

    The key and value states are kept in two parallel lists of arrays, one entry per layer. The expected shape
    of each array is `[batch_size, num_heads, seq_len, head_dim]`.
    """

    def __init__(self) -> None:
        super().__init__()
        self.key_cache: List[mx.array] = []
        self.value_cache: List[mx.array] = []
        self._seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen

    def __getitem__(self, layer_idx: int) -> Tuple[mx.array, mx.array]:
        """
        Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get
        the sequence length.

        Raises:
            KeyError: if `layer_idx` is not smaller than the number of cached layers.
        """
        if layer_idx < len(self):
            return (self.key_cache[layer_idx], self.value_cache[layer_idx])
        raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")

    def __iter__(self):
        """
        Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate
        over the `(key, value)` pair of each layer.
        """
        yield from zip(self.key_cache, self.value_cache)

    def __len__(self):
        """
        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value
        corresponds to the number of layers in the model.
        """
        return len(self.key_cache)

    def update(
        self,
        key_states: mx.array,
        value_states: mx.array,
        layer_idx: int,
        cache_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[mx.array, mx.array]:
        """
        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.

        Parameters:
            key_states (`mx.array`):
                The new key states to cache.
            value_states (`mx.array`):
                The new value states to cache.
            layer_idx (`int`):
                The index of the layer to cache the states for.
            cache_kwargs (`Dict[str, Any]`, `optional`):
                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.

        Return:
            A tuple containing the updated key and value states.
        """
        # Only layer 0 advances the token tally, so each step is counted once rather than once per layer.
        if layer_idx == 0:
            self._seen_tokens += key_states.shape[-2]

        if len(self.key_cache) <= layer_idx:
            # First time this layer is seen: start its cache with the incoming states.
            self.key_cache.append(key_states)
            self.value_cache.append(value_states)
        else:
            # Extend along the second-to-last axis. MLX names this keyword `axis`; the PyTorch-style
            # `dim` keyword is rejected with a TypeError.
            self.key_cache[layer_idx] = mx.concatenate([self.key_cache[layer_idx], key_states], axis=-2)
            self.value_cache[layer_idx] = mx.concatenate([self.value_cache[layer_idx], value_states], axis=-2)

        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        # TODO: deprecate this function in favor of `cache_position`
        if len(self.key_cache) <= layer_idx:
            # Nothing cached yet for this layer.
            return 0
        return self.key_cache[layer_idx].shape[-2]

    def get_max_length(self) -> Optional[int]:
        """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
        return None

    def to_legacy_cache(self) -> Tuple[Tuple[mx.array, mx.array], ...]:
        """Converts the `DynamicCache` instance into its equivalent in the legacy cache format."""
        # One `(key, value)` pair per layer, in layer order.
        return tuple(zip(self.key_cache, self.value_cache))

    @classmethod
    def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[FloatTensor]]] = None) -> "DynamicCache":
        """Converts a cache in the legacy cache format into an equivalent `DynamicCache`."""
        cache = cls()
        if past_key_values is not None:
            for layer_idx, (key_states, value_states) in enumerate(past_key_values):
                cache.update(key_states, value_states, layer_idx)
        return cache
246
+
247
+
248
+ @dataclass
249
+ class CausalLMOutputWithPast():
250
+
251
+ loss: Optional[FloatTensor] = None
252
+ logits: FloatTensor = None
253
+ past_key_values: Optional[Tuple[Tuple[FloatTensor]]] = None
254
+ hidden_states: Optional[Tuple[FloatTensor, ...]] = None
255
+ attentions: Optional[Tuple[FloatTensor, ...]] = None