katuni4ka committed
Commit bc58373
1 Parent(s): c5e5ff2

Upload 17 files

config.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "_name_or_path": "minicpm_v26",
3
  "architectures": [
4
  "MiniCPMV"
5
  ],
@@ -17,7 +17,7 @@
17
  "hidden_size": 256,
18
  "image_size": 28,
19
  "initializer_range": 0.02,
20
- "intermediate_size": 37,
21
  "max_position_embeddings": 32768,
22
  "max_window_layers": 2,
23
  "model_type": "minicpmv",
@@ -37,16 +37,17 @@
37
  "sliding_window": null,
38
  "tie_word_embeddings": false,
39
  "torch_dtype": "float32",
40
- "transformers_version": "4.45.1",
41
  "use_cache": true,
42
  "use_image_id": true,
43
  "use_sliding_window": false,
44
  "version": 2.6,
45
  "vision_batch_size": 16,
46
  "vision_config": {
 
47
  "hidden_size": 64,
48
  "image_size": 28,
49
- "intermediate_size": 4304,
50
  "model_type": "siglip_vision_model",
51
  "num_attention_heads": 2,
52
  "num_hidden_layers": 4,
 
1
  {
2
+ "_name_or_path": "/home/ea/work/my_optimum_intel/optimum-intel/tiny-random-minicpmv-2_6",
3
  "architectures": [
4
  "MiniCPMV"
5
  ],
 
17
  "hidden_size": 256,
18
  "image_size": 28,
19
  "initializer_range": 0.02,
20
+ "intermediate_size": 128,
21
  "max_position_embeddings": 32768,
22
  "max_window_layers": 2,
23
  "model_type": "minicpmv",
 
37
  "sliding_window": null,
38
  "tie_word_embeddings": false,
39
  "torch_dtype": "float32",
40
+ "transformers_version": "4.46.1",
41
  "use_cache": true,
42
  "use_image_id": true,
43
  "use_sliding_window": false,
44
  "version": 2.6,
45
  "vision_batch_size": 16,
46
  "vision_config": {
47
+ "_attn_implementation_autoset": true,
48
  "hidden_size": 64,
49
  "image_size": 28,
50
+ "intermediate_size": 128,
51
  "model_type": "siglip_vision_model",
52
  "num_attention_heads": 2,
53
  "num_hidden_layers": 4,
configuration_minicpm.py CHANGED
@@ -4,10 +4,12 @@
4
  import os
5
  from typing import Union
6
 
 
7
  from transformers.utils import logging
8
- from transformers import Qwen2Config, PretrainedConfig
9
  from .modeling_navit_siglip import SiglipVisionConfig
10
 
 
11
  logger = logging.get_logger(__name__)
12
 
13
 
@@ -44,7 +46,6 @@ class MiniCPMVSliceConfig(PretrainedConfig):
44
  return cls.from_dict(config_dict, **kwargs)
45
 
46
 
47
-
48
  class MiniCPMVConfig(Qwen2Config):
49
  model_type = "minicpmv"
50
  keys_to_ignore_at_inference = ["past_key_values"]
 
4
  import os
5
  from typing import Union
6
 
7
+ from transformers import PretrainedConfig, Qwen2Config
8
  from transformers.utils import logging
9
+
10
  from .modeling_navit_siglip import SiglipVisionConfig
11
 
12
+
13
  logger = logging.get_logger(__name__)
14
 
15
 
 
46
  return cls.from_dict(config_dict, **kwargs)
47
 
48
 
 
49
  class MiniCPMVConfig(Qwen2Config):
50
  model_type = "minicpmv"
51
  keys_to_ignore_at_inference = ["past_key_values"]
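
Since `MiniCPMVConfig` above is a `Qwen2Config` subclass with a nested `SiglipVisionConfig`, the updated values in `config.json` (text and vision `intermediate_size` both set to 128 in this commit) surface as ordinary attributes once the config is loaded with remote code enabled. A minimal sketch, assuming a hypothetical repository id that is not part of this diff:

from transformers import AutoConfig

# Hypothetical repo id, for illustration only; substitute the actual repository.
config = AutoConfig.from_pretrained("katuni4ka/tiny-random-minicpmv-2_6", trust_remote_code=True)

print(config.model_type)                       # "minicpmv"
print(config.intermediate_size)                # 128 after this commit (was 37)
print(config.vision_config.model_type)         # "siglip_vision_model"
print(config.vision_config.intermediate_size)  # 128 after this commit (was 4304)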
generation_config.json CHANGED
@@ -2,5 +2,5 @@
2
  "_from_model_config": true,
3
  "bos_token_id": 151643,
4
  "eos_token_id": 151645,
5
- "transformers_version": "4.45.1"
6
  }
 
2
  "_from_model_config": true,
3
  "bos_token_id": 151643,
4
  "eos_token_id": 151645,
5
+ "transformers_version": "4.46.1"
6
  }
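
The generation defaults above only pin the Qwen2-style special token ids, so they load like any other `generation_config.json`. A quick check, again with an assumed repo id:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("katuni4ka/tiny-random-minicpmv-2_6")  # assumed repo id
print(gen_cfg.bos_token_id, gen_cfg.eos_token_id)  # 151643 151645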
image_processing_minicpmv.py CHANGED
@@ -1,27 +1,23 @@
1
- from typing import Optional, Union, Dict, Any, List
2
-
3
- import torch
4
  import math
5
- import PIL.Image
6
- import PIL.ImageSequence
7
  import numpy as np
8
  import PIL
 
 
 
9
  from PIL import Image
10
-
11
- from transformers.utils import TensorType, requires_backends, is_torch_dtype, is_torch_device
12
- from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
13
  from transformers import AutoImageProcessor
 
14
  from transformers.image_transforms import to_channel_dimension_format
15
  from transformers.image_utils import (
16
- ImageInput,
17
- make_list_of_images,
18
- valid_images,
19
- is_torch_tensor,
20
- is_batched,
21
- to_numpy_array,
22
  infer_channel_dimension_format,
23
- ChannelDimension
 
 
24
  )
 
25
 
26
 
27
  def recursive_converter(converter, value):
@@ -38,6 +34,7 @@ class MiniCPMVBatchFeature(BatchFeature):
38
  r"""
39
  Extend from BatchFeature for supporting various image size
40
  """
 
41
  def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None):
42
  super().__init__(data)
43
  self.convert_to_tensors(tensor_type=tensor_type)
@@ -45,7 +42,7 @@ class MiniCPMVBatchFeature(BatchFeature):
45
  def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
46
  if tensor_type is None:
47
  return self
48
-
49
  is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type)
50
 
51
  def converter(value):
@@ -61,11 +58,10 @@ class MiniCPMVBatchFeature(BatchFeature):
61
  "with 'padding=True' to have batched tensors with the same length."
62
  )
63
 
64
-
65
  for key, value in self.items():
66
  self[key] = recursive_converter(converter, value)
67
  return self
68
-
69
  def to(self, *args, **kwargs) -> "MiniCPMVBatchFeature":
70
  requires_backends(self, ["torch"])
71
  import torch
@@ -104,12 +100,7 @@ class MiniCPMVBatchFeature(BatchFeature):
104
  class MiniCPMVImageProcessor(BaseImageProcessor):
105
  model_input_names = ["pixel_values"]
106
 
107
- def __init__(
108
- self,
109
- max_slice_nums=9,
110
- scale_resolution=448,
111
- patch_size=14,
112
- **kwargs):
113
  super().__init__(**kwargs)
114
  self.max_slice_nums = max_slice_nums
115
  self.scale_resolution = scale_resolution
@@ -131,14 +122,9 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
131
  def ensure_divide(self, length, patch_size):
132
  return max(round(length / patch_size) * patch_size, patch_size)
133
 
134
- def find_best_resize(self,
135
- original_size,
136
- scale_resolution,
137
- patch_size,
138
- allow_upscale=False):
139
  width, height = original_size
140
- if (width * height >
141
- scale_resolution * scale_resolution) or allow_upscale:
142
  r = width / height
143
  height = int(scale_resolution / math.sqrt(r))
144
  width = int(height * r)
@@ -146,12 +132,7 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
146
  best_height = self.ensure_divide(height, patch_size)
147
  return (best_width, best_height)
148
 
149
- def get_refine_size(self,
150
- original_size,
151
- grid,
152
- scale_resolution,
153
- patch_size,
154
- allow_upscale=False):
155
  width, height = original_size
156
  grid_x, grid_y = grid
157
 
@@ -161,10 +142,9 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
161
  grid_width = refine_width / grid_x
162
  grid_height = refine_height / grid_y
163
 
164
- best_grid_size = self.find_best_resize((grid_width, grid_height),
165
- scale_resolution,
166
- patch_size,
167
- allow_upscale=allow_upscale)
168
  refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y)
169
  return refine_size
170
 
@@ -182,9 +162,7 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
182
  patches.append(images)
183
  return patches
184
 
185
- def slice_image(
186
- self, image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False
187
- ):
188
  original_size = image.size
189
  source_image = None
190
  best_grid = self.get_sliced_grid(original_size, max_slice_nums, never_split)
@@ -192,9 +170,7 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
192
 
193
  if best_grid is None:
194
  # dont need to slice, upsample
195
- best_size = self.find_best_resize(
196
- original_size, scale_resolution, patch_size, allow_upscale=True
197
- )
198
  source_image = image.resize(best_size, resample=Image.Resampling.BICUBIC)
199
  else:
200
  # source image, down-sampling and ensure divided by patch_size
@@ -212,9 +188,7 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
212
  if grid is None:
213
  return ""
214
  slice_image_placeholder = (
215
- self.slice_start_token
216
- + self.unk_token * self.image_feature_size
217
- + self.slice_end_token
218
  )
219
 
220
  cols = grid[0]
@@ -225,13 +199,13 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
225
  for j in range(cols):
226
  lines.append(slice_image_placeholder)
227
  slices.append("".join(lines))
228
-
229
  slice_placeholder = "\n".join(slices)
230
  return slice_placeholder
231
 
232
  def get_image_id_placeholder(self, idx=0):
233
  return f"{self.im_id_start}{idx}{self.im_id_end}"
234
-
235
  def get_sliced_images(self, image, max_slice_nums=None):
236
  slice_images = []
237
 
@@ -239,12 +213,9 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
239
  return [image]
240
 
241
  max_slice_nums = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
242
- assert max_slice_nums > 0
243
  source_image, patches, sliced_grid = self.slice_image(
244
- image,
245
- max_slice_nums, # default: 9
246
- self.scale_resolution, # default: 448
247
- self.patch_size # default: 14
248
  )
249
 
250
  slice_images.append(source_image)
@@ -266,7 +237,7 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
266
  if i == 1 or i > max_slice_nums:
267
  continue
268
  candidate_split_grids_nums.append(i)
269
-
270
  candidate_grids = []
271
  for split_grids_nums in candidate_split_grids_nums:
272
  m = 1
@@ -282,19 +253,15 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
282
  if error < min_error:
283
  best_grid = grid
284
  min_error = error
285
-
286
  return best_grid
287
-
288
  def get_slice_image_placeholder(self, image_size, image_idx=0, max_slice_nums=None, use_image_id=None):
289
  max_slice_nums = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
290
- assert max_slice_nums > 0
291
  grid = self.get_sliced_grid(image_size=image_size, max_slice_nums=max_slice_nums)
292
 
293
- image_placeholder = (
294
- self.im_start_token
295
- + self.unk_token * self.image_feature_size
296
- + self.im_end_token
297
- )
298
  use_image_id = self.use_image_id if use_image_id is None else bool(use_image_id)
299
  if use_image_id:
300
  final_placeholder = self.get_image_id_placeholder(image_idx) + image_placeholder
@@ -304,7 +271,7 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
304
  if self.slice_mode:
305
  final_placeholder = final_placeholder + self.get_grid_placeholder(grid=grid)
306
  return final_placeholder
307
-
308
  def to_pil_image(self, image, rescale=None) -> PIL.Image.Image:
309
  """
310
  Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
@@ -343,24 +310,20 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
343
  """
344
  image = torch.from_numpy(image)
345
  patch_size = self.patch_size
346
- patches = torch.nn.functional.unfold(
347
- image,
348
- (patch_size, patch_size),
349
- stride=(patch_size, patch_size)
350
- )
351
 
352
  patches = patches.reshape(image.size(0), patch_size, patch_size, -1)
353
  patches = patches.permute(0, 1, 3, 2).reshape(image.size(0), patch_size, -1)
354
  return patches.numpy()
355
 
356
  def preprocess(
357
- self,
358
- images: Union[Image.Image, List[Image.Image], List[List[Image.Image]]],
359
- do_pad: Optional[bool] = True, # TODO: add pad for MiniCPM-Llama3-V-2_5
360
- max_slice_nums: int = None,
361
- return_tensors: Optional[Union[str, TensorType]] = None,
362
- **kwargs
363
- ) -> MiniCPMVBatchFeature:
364
  if isinstance(images, Image.Image):
365
  images_list = [[images]]
366
  elif isinstance(images[0], Image.Image):
@@ -371,19 +334,19 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
371
  new_images_list = []
372
  image_sizes_list = []
373
  tgt_sizes_list = []
374
-
375
  for _images in images_list:
376
  if _images is None or len(_images) == 0:
377
  new_images_list.append([])
378
  image_sizes_list.append([])
379
  tgt_sizes_list.append([])
380
- continue
381
  if not valid_images(_images):
382
  raise ValueError(
383
  "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
384
  "torch.Tensor, tf.Tensor or jax.ndarray."
385
  )
386
-
387
  _images = [self.to_pil_image(image).convert("RGB") for image in _images]
388
  input_data_format = infer_channel_dimension_format(np.array(_images[0]))
389
 
@@ -395,24 +358,28 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
395
  image_patches = [to_numpy_array(image).astype(np.float32) / 255 for image in image_patches]
396
  image_patches = [
397
  self.normalize(image=image, mean=self.mean, std=self.std, input_data_format=input_data_format)
398
- for image in image_patches
399
  ]
400
  image_patches = [
401
- to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format)
402
- for image in image_patches
403
  ]
404
  for slice_image in image_patches:
405
  new_images.append(self.reshape_by_patch(slice_image))
406
- tgt_sizes.append(np.array((slice_image.shape[1] // self.patch_size, slice_image.shape[2] // self.patch_size)))
 
 
407
 
408
  if tgt_sizes:
409
  tgt_sizes = np.vstack(tgt_sizes)
410
-
411
  new_images_list.append(new_images)
412
  image_sizes_list.append(image_sizes)
413
  tgt_sizes_list.append(tgt_sizes)
414
  return MiniCPMVBatchFeature(
415
- data={"pixel_values": new_images_list, "image_sizes": image_sizes_list, "tgt_sizes": tgt_sizes_list}, tensor_type=return_tensors
 
416
  )
417
 
 
418
  AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
 
 
 
 
1
  import math
2
+ from typing import Any, Dict, List, Optional, Union
3
+
4
  import numpy as np
5
  import PIL
6
+ import PIL.Image
7
+ import PIL.ImageSequence
8
+ import torch
9
  from PIL import Image
 
 
 
10
  from transformers import AutoImageProcessor
11
+ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
12
  from transformers.image_transforms import to_channel_dimension_format
13
  from transformers.image_utils import (
14
+ ChannelDimension,
 
 
 
 
 
15
  infer_channel_dimension_format,
16
+ is_torch_tensor,
17
+ to_numpy_array,
18
+ valid_images,
19
  )
20
+ from transformers.utils import TensorType, is_torch_device, is_torch_dtype, requires_backends
21
 
22
 
23
  def recursive_converter(converter, value):
 
34
  r"""
35
  Extend from BatchFeature for supporting various image size
36
  """
37
+
38
  def __init__(self, data: Optional[Dict[str, Any]] = None, tensor_type: Union[None, str, TensorType] = None):
39
  super().__init__(data)
40
  self.convert_to_tensors(tensor_type=tensor_type)
 
42
  def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
43
  if tensor_type is None:
44
  return self
45
+
46
  is_tensor, as_tensor = self._get_is_as_tensor_fns(tensor_type)
47
 
48
  def converter(value):
 
58
  "with 'padding=True' to have batched tensors with the same length."
59
  )
60
 
 
61
  for key, value in self.items():
62
  self[key] = recursive_converter(converter, value)
63
  return self
64
+
65
  def to(self, *args, **kwargs) -> "MiniCPMVBatchFeature":
66
  requires_backends(self, ["torch"])
67
  import torch
 
100
  class MiniCPMVImageProcessor(BaseImageProcessor):
101
  model_input_names = ["pixel_values"]
102
 
103
+ def __init__(self, max_slice_nums=9, scale_resolution=448, patch_size=14, **kwargs):
 
 
 
 
 
104
  super().__init__(**kwargs)
105
  self.max_slice_nums = max_slice_nums
106
  self.scale_resolution = scale_resolution
 
122
  def ensure_divide(self, length, patch_size):
123
  return max(round(length / patch_size) * patch_size, patch_size)
124
 
125
+ def find_best_resize(self, original_size, scale_resolution, patch_size, allow_upscale=False):
 
 
 
 
126
  width, height = original_size
127
+ if (width * height > scale_resolution * scale_resolution) or allow_upscale:
 
128
  r = width / height
129
  height = int(scale_resolution / math.sqrt(r))
130
  width = int(height * r)
 
132
  best_height = self.ensure_divide(height, patch_size)
133
  return (best_width, best_height)
134
 
135
+ def get_refine_size(self, original_size, grid, scale_resolution, patch_size, allow_upscale=False):
 
 
 
 
 
136
  width, height = original_size
137
  grid_x, grid_y = grid
138
 
 
142
  grid_width = refine_width / grid_x
143
  grid_height = refine_height / grid_y
144
 
145
+ best_grid_size = self.find_best_resize(
146
+ (grid_width, grid_height), scale_resolution, patch_size, allow_upscale=allow_upscale
147
+ )
 
148
  refine_size = (best_grid_size[0] * grid_x, best_grid_size[1] * grid_y)
149
  return refine_size
150
 
 
162
  patches.append(images)
163
  return patches
164
 
165
+ def slice_image(self, image, max_slice_nums=9, scale_resolution=448, patch_size=14, never_split=False):
 
 
166
  original_size = image.size
167
  source_image = None
168
  best_grid = self.get_sliced_grid(original_size, max_slice_nums, never_split)
 
170
 
171
  if best_grid is None:
172
  # dont need to slice, upsample
173
+ best_size = self.find_best_resize(original_size, scale_resolution, patch_size, allow_upscale=True)
 
 
174
  source_image = image.resize(best_size, resample=Image.Resampling.BICUBIC)
175
  else:
176
  # source image, down-sampling and ensure divided by patch_size
 
188
  if grid is None:
189
  return ""
190
  slice_image_placeholder = (
191
+ self.slice_start_token + self.unk_token * self.image_feature_size + self.slice_end_token
 
 
192
  )
193
 
194
  cols = grid[0]
 
199
  for j in range(cols):
200
  lines.append(slice_image_placeholder)
201
  slices.append("".join(lines))
202
+
203
  slice_placeholder = "\n".join(slices)
204
  return slice_placeholder
205
 
206
  def get_image_id_placeholder(self, idx=0):
207
  return f"{self.im_id_start}{idx}{self.im_id_end}"
208
+
209
  def get_sliced_images(self, image, max_slice_nums=None):
210
  slice_images = []
211
 
 
213
  return [image]
214
 
215
  max_slice_nums = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
216
+ assert max_slice_nums > 0
217
  source_image, patches, sliced_grid = self.slice_image(
218
+ image, max_slice_nums, self.scale_resolution, self.patch_size # default: 9 # default: 448 # default: 14
 
 
 
219
  )
220
 
221
  slice_images.append(source_image)
 
237
  if i == 1 or i > max_slice_nums:
238
  continue
239
  candidate_split_grids_nums.append(i)
240
+
241
  candidate_grids = []
242
  for split_grids_nums in candidate_split_grids_nums:
243
  m = 1
 
253
  if error < min_error:
254
  best_grid = grid
255
  min_error = error
256
+
257
  return best_grid
258
+
259
  def get_slice_image_placeholder(self, image_size, image_idx=0, max_slice_nums=None, use_image_id=None):
260
  max_slice_nums = self.max_slice_nums if max_slice_nums is None else int(max_slice_nums)
261
+ assert max_slice_nums > 0
262
  grid = self.get_sliced_grid(image_size=image_size, max_slice_nums=max_slice_nums)
263
 
264
+ image_placeholder = self.im_start_token + self.unk_token * self.image_feature_size + self.im_end_token
 
 
 
 
265
  use_image_id = self.use_image_id if use_image_id is None else bool(use_image_id)
266
  if use_image_id:
267
  final_placeholder = self.get_image_id_placeholder(image_idx) + image_placeholder
 
271
  if self.slice_mode:
272
  final_placeholder = final_placeholder + self.get_grid_placeholder(grid=grid)
273
  return final_placeholder
274
+
275
  def to_pil_image(self, image, rescale=None) -> PIL.Image.Image:
276
  """
277
  Converts `image` to a PIL Image. Optionally rescales it and puts the channel dimension back as the last axis if
 
310
  """
311
  image = torch.from_numpy(image)
312
  patch_size = self.patch_size
313
+ patches = torch.nn.functional.unfold(image, (patch_size, patch_size), stride=(patch_size, patch_size))
 
 
 
 
314
 
315
  patches = patches.reshape(image.size(0), patch_size, patch_size, -1)
316
  patches = patches.permute(0, 1, 3, 2).reshape(image.size(0), patch_size, -1)
317
  return patches.numpy()
318
 
319
  def preprocess(
320
+ self,
321
+ images: Union[Image.Image, List[Image.Image], List[List[Image.Image]]],
322
+ do_pad: Optional[bool] = True, # TODO: add pad for MiniCPM-Llama3-V-2_5
323
+ max_slice_nums: int = None,
324
+ return_tensors: Optional[Union[str, TensorType]] = None,
325
+ **kwargs,
326
+ ) -> MiniCPMVBatchFeature:
327
  if isinstance(images, Image.Image):
328
  images_list = [[images]]
329
  elif isinstance(images[0], Image.Image):
 
334
  new_images_list = []
335
  image_sizes_list = []
336
  tgt_sizes_list = []
337
+
338
  for _images in images_list:
339
  if _images is None or len(_images) == 0:
340
  new_images_list.append([])
341
  image_sizes_list.append([])
342
  tgt_sizes_list.append([])
343
+ continue
344
  if not valid_images(_images):
345
  raise ValueError(
346
  "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
347
  "torch.Tensor, tf.Tensor or jax.ndarray."
348
  )
349
+
350
  _images = [self.to_pil_image(image).convert("RGB") for image in _images]
351
  input_data_format = infer_channel_dimension_format(np.array(_images[0]))
352
 
 
358
  image_patches = [to_numpy_array(image).astype(np.float32) / 255 for image in image_patches]
359
  image_patches = [
360
  self.normalize(image=image, mean=self.mean, std=self.std, input_data_format=input_data_format)
361
+ for image in image_patches
362
  ]
363
  image_patches = [
364
+ to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format)
365
+ for image in image_patches
366
  ]
367
  for slice_image in image_patches:
368
  new_images.append(self.reshape_by_patch(slice_image))
369
+ tgt_sizes.append(
370
+ np.array((slice_image.shape[1] // self.patch_size, slice_image.shape[2] // self.patch_size))
371
+ )
372
 
373
  if tgt_sizes:
374
  tgt_sizes = np.vstack(tgt_sizes)
375
+
376
  new_images_list.append(new_images)
377
  image_sizes_list.append(image_sizes)
378
  tgt_sizes_list.append(tgt_sizes)
379
  return MiniCPMVBatchFeature(
380
+ data={"pixel_values": new_images_list, "image_sizes": image_sizes_list, "tgt_sizes": tgt_sizes_list},
381
+ tensor_type=return_tensors,
382
  )
383
 
384
+
385
  AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
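
The reformatted `ensure_divide` and `find_best_resize` above carry the core slicing arithmetic: images larger than `scale_resolution**2` pixels are shrunk to roughly that area while keeping the aspect ratio, and both sides are then snapped to multiples of `patch_size`. A self-contained sketch of just that logic, lifted out of the class for illustration:

import math

def ensure_divide(length, patch_size):
    # Snap to the nearest multiple of patch_size, never below one patch.
    return max(round(length / patch_size) * patch_size, patch_size)

def find_best_resize(original_size, scale_resolution=448, patch_size=14, allow_upscale=False):
    width, height = original_size
    if (width * height > scale_resolution * scale_resolution) or allow_upscale:
        r = width / height                            # preserve aspect ratio
        height = int(scale_resolution / math.sqrt(r))
        width = int(height * r)
    return ensure_divide(width, patch_size), ensure_divide(height, patch_size)

print(find_best_resize((1024, 768)))  # (518, 392): both sides divisible by 14, area close to 448 * 448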
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:244f72a0389de521d87c3411aaf425ebb85e19144f557f6ed0363ce84eb385f5
3
- size 323558976
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a13c2a624f4445809755648b73465369b127bf9f4c7a6a87ccf0c7498039149
3
+ size 315498808
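
The weight file itself is tracked through Git LFS, so the diff above only updates the object id and size (about 315 MB for this tiny test checkpoint). Once downloaded, the stored tensors can be listed without loading them, for example:

from safetensors import safe_open

# Assumes the checkpoint has already been fetched locally as model.safetensors.
with safe_open("model.safetensors", framework="pt") as f:
    print(len(f.keys()))  # number of stored tensors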
modeling_minicpmv.py CHANGED
@@ -1,20 +1,17 @@
1
- import math
2
- from typing import List, Optional
3
  import json
4
- import torch
5
- import torchvision
6
-
7
- from threading import Thread
8
  from copy import deepcopy
 
 
 
9
  from PIL import Image
10
- from transformers import AutoProcessor, Qwen2PreTrainedModel, Qwen2ForCausalLM, TextIteratorStreamer
11
 
12
  from .configuration_minicpm import MiniCPMVConfig
13
  from .modeling_navit_siglip import SiglipVisionTransformer
14
  from .resampler import Resampler
15
 
16
 
17
-
18
  class MiniCPMVPreTrainedModel(Qwen2PreTrainedModel):
19
  config_class = MiniCPMVConfig
20
 
@@ -29,21 +26,21 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
29
  self.resampler = self.init_resampler(self.embed_dim, self.vision_dim)
30
  self.processor = None
31
 
32
- self.terminators = ['<|im_end|>', '<|endoftext|>']
33
 
34
  def init_vision_module(self):
35
  # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
36
- if self.config._attn_implementation == 'flash_attention_2':
37
- self.config.vision_config._attn_implementation = 'flash_attention_2'
38
  else:
39
  # not suport sdpa
40
- self.config.vision_config._attn_implementation = 'eager'
41
  model = SiglipVisionTransformer(self.config.vision_config)
42
  if self.config.drop_vision_last_layer:
43
  model.encoder.layers = model.encoder.layers[:-1]
44
 
45
- setattr(model, 'embed_dim', model.embeddings.embed_dim)
46
- setattr(model, 'patch_size', model.embeddings.patch_size)
47
 
48
  return model
49
 
@@ -53,7 +50,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
53
  embed_dim=embed_dim,
54
  num_heads=embed_dim // 128,
55
  kv_dim=vision_dim,
56
- adaptive=True
57
  )
58
 
59
  def get_input_embeddings(self):
@@ -75,11 +72,11 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
75
  return self.llm
76
 
77
  def get_vllm_embedding(self, data):
78
- if 'vision_hidden_states' not in data:
79
  dtype = self.llm.model.embed_tokens.weight.dtype
80
  device = self.llm.model.embed_tokens.weight.device
81
- tgt_sizes = data['tgt_sizes']
82
- pixel_values_list = data['pixel_values']
83
  vision_hidden_states = []
84
  all_pixel_values = []
85
  img_cnt = []
@@ -94,14 +91,15 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
94
 
95
  max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1])
96
 
97
- all_pixel_values = torch.nn.utils.rnn.pad_sequence(all_pixel_values, batch_first=True,
98
- padding_value=0.0)
 
99
  B, L, _ = all_pixel_values.shape
100
  all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
101
 
102
  patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool, device=device)
103
  for i in range(B):
104
- patch_attn_mask[i, 0, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True
105
 
106
  vision_batch_size = self.config.vision_batch_size
107
  all_pixel_values = all_pixel_values.type(dtype)
@@ -110,28 +108,33 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
110
  for i in range(0, B, vision_batch_size):
111
  start_idx = i
112
  end_idx = i + vision_batch_size
113
- tmp_hs = self.vpm(all_pixel_values[start_idx:end_idx], patch_attention_mask=patch_attn_mask[start_idx:end_idx], tgt_sizes=tgt_sizes[start_idx:end_idx]).last_hidden_state
 
 
 
 
114
  hs.append(tmp_hs)
115
  vision_embedding = torch.cat(hs, dim=0)
116
  else:
117
- vision_embedding = self.vpm(all_pixel_values, patch_attention_mask=patch_attn_mask, tgt_sizes=tgt_sizes).last_hidden_state
 
 
118
  vision_embedding = self.resampler(vision_embedding, tgt_sizes)
119
 
120
  start = 0
121
  for pixel_values in pixel_values_list:
122
  img_cnt = len(pixel_values)
123
  if img_cnt > 0:
124
- vision_hidden_states.append(vision_embedding[start: start + img_cnt])
125
  start += img_cnt
126
  else:
127
  vision_hidden_states.append([])
128
- else: # no image
129
  if self.training:
130
- dummy_image = torch.zeros(
131
- (1, 3, 224, 224),
132
- device=device, dtype=dtype
133
- )
134
- tgt_sizes = torch.Tensor([[(224 // self.config.patch_size), math.ceil(224 / self.config.patch_size)]]).type(torch.int32)
135
  dummy_feature = self.resampler(self.vpm(dummy_image).last_hidden_state, tgt_sizes)
136
  else:
137
  dummy_feature = []
@@ -139,29 +142,33 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
139
  vision_hidden_states.append(dummy_feature)
140
 
141
  else:
142
- vision_hidden_states = data['vision_hidden_states']
143
 
144
- if hasattr(self.llm.config, 'scale_emb'):
145
- vllm_embedding = self.llm.model.embed_tokens(data['input_ids']) * self.llm.config.scale_emb
146
  else:
147
- vllm_embedding = self.llm.model.embed_tokens(data['input_ids'])
148
 
149
- vision_hidden_states = [i.type(vllm_embedding.dtype) if isinstance(
150
- i, torch.Tensor) else i for i in vision_hidden_states]
 
151
 
152
- bs = len(data['input_ids'])
153
  for i in range(bs):
154
  cur_vs_hs = vision_hidden_states[i]
155
  if len(cur_vs_hs) > 0:
156
  cur_vllm_emb = vllm_embedding[i]
157
- cur_image_bound = data['image_bound'][i]
158
  if len(cur_image_bound) > 0:
159
  image_indices = torch.stack(
160
  [torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound]
161
  ).to(vllm_embedding.device)
162
 
163
- cur_vllm_emb.scatter_(0, image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]),
164
- cur_vs_hs.view(-1, cur_vs_hs.shape[-1]))
 
 
 
165
  elif self.training:
166
  cur_vllm_emb += cur_vs_hs[0].mean() * 0
167
 
@@ -173,13 +180,8 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
173
  if position_ids.dtype != torch.int64:
174
  position_ids = position_ids.long()
175
 
176
- return self.llm(
177
- input_ids=None,
178
- position_ids=position_ids,
179
- inputs_embeds=vllm_embedding,
180
- **kwargs
181
- )
182
-
183
  def _decode(self, inputs_embeds, tokenizer, attention_mask, decode_text=False, **kwargs):
184
  terminators = None
185
  if tokenizer is not None:
@@ -187,10 +189,10 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
187
  kwargs.pop("image_sizes")
188
  output = self.llm.generate(
189
  inputs_embeds=inputs_embeds,
190
- #pad_token_id=0,
191
  eos_token_id=terminators,
192
  attention_mask=attention_mask,
193
- **kwargs
194
  )
195
  if decode_text:
196
  return self._decode_text(output, tokenizer)
@@ -200,16 +202,16 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
200
  terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
201
  streamer = TextIteratorStreamer(tokenizer=tokenizer)
202
  generation_kwargs = {
203
- 'inputs_embeds': inputs_embeds,
204
- 'pad_token_id': 0,
205
- 'eos_token_id': terminators,
206
- 'streamer': streamer
207
  }
208
  generation_kwargs.update(kwargs)
209
 
210
  thread = Thread(target=self.llm.generate, kwargs=generation_kwargs)
211
  thread.start()
212
-
213
  return streamer
214
 
215
  def _decode_text(self, result_ids, tokenizer):
@@ -236,7 +238,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
236
  return_vision_hidden_states=False,
237
  stream=False,
238
  decode_text=False,
239
- **kwargs
240
  ):
241
  assert input_ids is not None
242
  assert len(input_ids) == len(pixel_values)
@@ -248,7 +250,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
248
 
249
  if vision_hidden_states is None:
250
  model_inputs["pixel_values"] = pixel_values
251
- model_inputs['tgt_sizes'] = tgt_sizes
252
  else:
253
  model_inputs["vision_hidden_states"] = vision_hidden_states
254
 
@@ -261,11 +263,13 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
261
  if stream:
262
  result = self._decode_stream(model_inputs["inputs_embeds"], tokenizer, **kwargs)
263
  else:
264
- result = self._decode(model_inputs["inputs_embeds"], tokenizer, attention_mask, decode_text=decode_text, **kwargs)
 
 
265
 
266
  if return_vision_hidden_states:
267
  return result, vision_hidden_states
268
-
269
  return result
270
 
271
  def chat(
@@ -279,11 +283,11 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
279
  min_new_tokens=0,
280
  sampling=True,
281
  max_inp_length=8192,
282
- system_prompt='',
283
  stream=False,
284
  max_slice_nums=None,
285
  use_image_id=None,
286
- **kwargs
287
  ):
288
  if isinstance(msgs[0], list):
289
  batched = True
@@ -291,7 +295,7 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
291
  batched = False
292
  msgs_list = msgs
293
  images_list = image
294
-
295
  if batched is False:
296
  images_list, msgs_list = [images_list], [msgs_list]
297
  else:
@@ -303,12 +307,22 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
303
  if self.processor is None:
304
  self.processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True)
305
  processor = self.processor
306
-
307
- assert self.config.query_num == processor.image_processor.image_feature_size, "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
308
- assert self.config.patch_size == processor.image_processor.patch_size, "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
309
- assert self.config.use_image_id == processor.image_processor.use_image_id, "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
310
- assert self.config.slice_config.max_slice_nums == processor.image_processor.max_slice_nums, "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
311
- assert self.config.slice_mode == processor.image_processor.slice_mode, "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
 
 
 
 
 
 
 
 
 
 
312
 
313
  prompts_lists = []
314
  input_images_lists = []
@@ -342,19 +356,21 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
342
  msg["content"] = "\n".join(cur_msgs)
343
 
344
  if system_prompt:
345
- sys_msg = {'role': 'system', 'content': system_prompt}
346
- copy_msgs = [sys_msg] + copy_msgs
347
 
348
- prompts_lists.append(processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True))
 
 
349
  input_images_lists.append(images)
350
 
351
  inputs = processor(
352
- prompts_lists,
353
- input_images_lists,
354
  max_slice_nums=max_slice_nums,
355
  use_image_id=use_image_id,
356
- return_tensors="pt",
357
- max_length=max_inp_length
358
  ).to(self.device)
359
 
360
  if sampling:
@@ -363,20 +379,18 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
363
  "top_k": 100,
364
  "temperature": 0.7,
365
  "do_sample": True,
366
- "repetition_penalty": 1.05
367
  }
368
  else:
369
  generation_config = {
370
  "num_beams": 3,
371
  "repetition_penalty": 1.2,
372
  }
373
-
374
  if min_new_tokens > 0:
375
- generation_config['min_new_tokens'] = min_new_tokens
376
 
377
- generation_config.update(
378
- (k, kwargs[k]) for k in generation_config.keys() & kwargs.keys()
379
- )
380
 
381
  inputs.pop("image_sizes")
382
  with torch.inference_mode():
@@ -387,15 +401,17 @@ class MiniCPMV(MiniCPMVPreTrainedModel):
387
  vision_hidden_states=vision_hidden_states,
388
  stream=stream,
389
  decode_text=True,
390
- **generation_config
391
  )
392
-
393
  if stream:
 
394
  def stream_gen():
395
  for text in res:
396
  for term in self.terminators:
397
- text = text.replace(term, '')
398
  yield text
 
399
  return stream_gen()
400
 
401
  else:
 
 
 
1
  import json
2
+ import math
 
 
 
3
  from copy import deepcopy
4
+ from threading import Thread
5
+
6
+ import torch
7
  from PIL import Image
8
+ from transformers import AutoProcessor, Qwen2ForCausalLM, Qwen2PreTrainedModel, TextIteratorStreamer
9
 
10
  from .configuration_minicpm import MiniCPMVConfig
11
  from .modeling_navit_siglip import SiglipVisionTransformer
12
  from .resampler import Resampler
13
 
14
 
 
15
  class MiniCPMVPreTrainedModel(Qwen2PreTrainedModel):
16
  config_class = MiniCPMVConfig
17
 
 
26
  self.resampler = self.init_resampler(self.embed_dim, self.vision_dim)
27
  self.processor = None
28
 
29
+ self.terminators = ["<|im_end|>", "<|endoftext|>"]
30
 
31
  def init_vision_module(self):
32
  # same as HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit add tgt_sizes
33
+ if self.config._attn_implementation == "flash_attention_2":
34
+ self.config.vision_config._attn_implementation = "flash_attention_2"
35
  else:
36
  # not suport sdpa
37
+ self.config.vision_config._attn_implementation = "eager"
38
  model = SiglipVisionTransformer(self.config.vision_config)
39
  if self.config.drop_vision_last_layer:
40
  model.encoder.layers = model.encoder.layers[:-1]
41
 
42
+ setattr(model, "embed_dim", model.embeddings.embed_dim)
43
+ setattr(model, "patch_size", model.embeddings.patch_size)
44
 
45
  return model
46
 
 
50
  embed_dim=embed_dim,
51
  num_heads=embed_dim // 128,
52
  kv_dim=vision_dim,
53
+ adaptive=True,
54
  )
55
 
56
  def get_input_embeddings(self):
 
72
  return self.llm
73
 
74
  def get_vllm_embedding(self, data):
75
+ if "vision_hidden_states" not in data:
76
  dtype = self.llm.model.embed_tokens.weight.dtype
77
  device = self.llm.model.embed_tokens.weight.device
78
+ tgt_sizes = data["tgt_sizes"]
79
+ pixel_values_list = data["pixel_values"]
80
  vision_hidden_states = []
81
  all_pixel_values = []
82
  img_cnt = []
 
91
 
92
  max_patches = torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1])
93
 
94
+ all_pixel_values = torch.nn.utils.rnn.pad_sequence(
95
+ all_pixel_values, batch_first=True, padding_value=0.0
96
+ )
97
  B, L, _ = all_pixel_values.shape
98
  all_pixel_values = all_pixel_values.permute(0, 2, 1).reshape(B, 3, -1, L)
99
 
100
  patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool, device=device)
101
  for i in range(B):
102
+ patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True
103
 
104
  vision_batch_size = self.config.vision_batch_size
105
  all_pixel_values = all_pixel_values.type(dtype)
 
108
  for i in range(0, B, vision_batch_size):
109
  start_idx = i
110
  end_idx = i + vision_batch_size
111
+ tmp_hs = self.vpm(
112
+ all_pixel_values[start_idx:end_idx],
113
+ patch_attention_mask=patch_attn_mask[start_idx:end_idx],
114
+ tgt_sizes=tgt_sizes[start_idx:end_idx],
115
+ ).last_hidden_state
116
  hs.append(tmp_hs)
117
  vision_embedding = torch.cat(hs, dim=0)
118
  else:
119
+ vision_embedding = self.vpm(
120
+ all_pixel_values, patch_attention_mask=patch_attn_mask, tgt_sizes=tgt_sizes
121
+ ).last_hidden_state
122
  vision_embedding = self.resampler(vision_embedding, tgt_sizes)
123
 
124
  start = 0
125
  for pixel_values in pixel_values_list:
126
  img_cnt = len(pixel_values)
127
  if img_cnt > 0:
128
+ vision_hidden_states.append(vision_embedding[start : start + img_cnt])
129
  start += img_cnt
130
  else:
131
  vision_hidden_states.append([])
132
+ else: # no image
133
  if self.training:
134
+ dummy_image = torch.zeros((1, 3, 224, 224), device=device, dtype=dtype)
135
+ tgt_sizes = torch.Tensor(
136
+ [[(224 // self.config.patch_size), math.ceil(224 / self.config.patch_size)]]
137
+ ).type(torch.int32)
 
138
  dummy_feature = self.resampler(self.vpm(dummy_image).last_hidden_state, tgt_sizes)
139
  else:
140
  dummy_feature = []
 
142
  vision_hidden_states.append(dummy_feature)
143
 
144
  else:
145
+ vision_hidden_states = data["vision_hidden_states"]
146
 
147
+ if hasattr(self.llm.config, "scale_emb"):
148
+ vllm_embedding = self.llm.model.embed_tokens(data["input_ids"]) * self.llm.config.scale_emb
149
  else:
150
+ vllm_embedding = self.llm.model.embed_tokens(data["input_ids"])
151
 
152
+ vision_hidden_states = [
153
+ i.type(vllm_embedding.dtype) if isinstance(i, torch.Tensor) else i for i in vision_hidden_states
154
+ ]
155
 
156
+ bs = len(data["input_ids"])
157
  for i in range(bs):
158
  cur_vs_hs = vision_hidden_states[i]
159
  if len(cur_vs_hs) > 0:
160
  cur_vllm_emb = vllm_embedding[i]
161
+ cur_image_bound = data["image_bound"][i]
162
  if len(cur_image_bound) > 0:
163
  image_indices = torch.stack(
164
  [torch.arange(r[0], r[1], dtype=torch.long) for r in cur_image_bound]
165
  ).to(vllm_embedding.device)
166
 
167
+ cur_vllm_emb.scatter_(
168
+ 0,
169
+ image_indices.view(-1, 1).repeat(1, cur_vllm_emb.shape[-1]),
170
+ cur_vs_hs.view(-1, cur_vs_hs.shape[-1]),
171
+ )
172
  elif self.training:
173
  cur_vllm_emb += cur_vs_hs[0].mean() * 0
174
 
 
180
  if position_ids.dtype != torch.int64:
181
  position_ids = position_ids.long()
182
 
183
+ return self.llm(input_ids=None, position_ids=position_ids, inputs_embeds=vllm_embedding, **kwargs)
184
+
 
 
 
 
 
185
  def _decode(self, inputs_embeds, tokenizer, attention_mask, decode_text=False, **kwargs):
186
  terminators = None
187
  if tokenizer is not None:
 
189
  kwargs.pop("image_sizes")
190
  output = self.llm.generate(
191
  inputs_embeds=inputs_embeds,
192
+ # pad_token_id=0,
193
  eos_token_id=terminators,
194
  attention_mask=attention_mask,
195
+ **kwargs,
196
  )
197
  if decode_text:
198
  return self._decode_text(output, tokenizer)
 
202
  terminators = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
203
  streamer = TextIteratorStreamer(tokenizer=tokenizer)
204
  generation_kwargs = {
205
+ "inputs_embeds": inputs_embeds,
206
+ "pad_token_id": 0,
207
+ "eos_token_id": terminators,
208
+ "streamer": streamer,
209
  }
210
  generation_kwargs.update(kwargs)
211
 
212
  thread = Thread(target=self.llm.generate, kwargs=generation_kwargs)
213
  thread.start()
214
+
215
  return streamer
216
 
217
  def _decode_text(self, result_ids, tokenizer):
 
238
  return_vision_hidden_states=False,
239
  stream=False,
240
  decode_text=False,
241
+ **kwargs,
242
  ):
243
  assert input_ids is not None
244
  assert len(input_ids) == len(pixel_values)
 
250
 
251
  if vision_hidden_states is None:
252
  model_inputs["pixel_values"] = pixel_values
253
+ model_inputs["tgt_sizes"] = tgt_sizes
254
  else:
255
  model_inputs["vision_hidden_states"] = vision_hidden_states
256
 
 
263
  if stream:
264
  result = self._decode_stream(model_inputs["inputs_embeds"], tokenizer, **kwargs)
265
  else:
266
+ result = self._decode(
267
+ model_inputs["inputs_embeds"], tokenizer, attention_mask, decode_text=decode_text, **kwargs
268
+ )
269
 
270
  if return_vision_hidden_states:
271
  return result, vision_hidden_states
272
+
273
  return result
274
 
275
  def chat(
 
283
  min_new_tokens=0,
284
  sampling=True,
285
  max_inp_length=8192,
286
+ system_prompt="",
287
  stream=False,
288
  max_slice_nums=None,
289
  use_image_id=None,
290
+ **kwargs,
291
  ):
292
  if isinstance(msgs[0], list):
293
  batched = True
 
295
  batched = False
296
  msgs_list = msgs
297
  images_list = image
298
+
299
  if batched is False:
300
  images_list, msgs_list = [images_list], [msgs_list]
301
  else:
 
307
  if self.processor is None:
308
  self.processor = AutoProcessor.from_pretrained(self.config._name_or_path, trust_remote_code=True)
309
  processor = self.processor
310
+
311
+ assert (
312
+ self.config.query_num == processor.image_processor.image_feature_size
313
+ ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
314
+ assert (
315
+ self.config.patch_size == processor.image_processor.patch_size
316
+ ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
317
+ assert (
318
+ self.config.use_image_id == processor.image_processor.use_image_id
319
+ ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
320
+ assert (
321
+ self.config.slice_config.max_slice_nums == processor.image_processor.max_slice_nums
322
+ ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
323
+ assert (
324
+ self.config.slice_mode == processor.image_processor.slice_mode
325
+ ), "These two values should be the same. Check `config.json` and `preprocessor_config.json`."
326
 
327
  prompts_lists = []
328
  input_images_lists = []
 
356
  msg["content"] = "\n".join(cur_msgs)
357
 
358
  if system_prompt:
359
+ sys_msg = {"role": "system", "content": system_prompt}
360
+ copy_msgs = [sys_msg] + copy_msgs
361
 
362
+ prompts_lists.append(
363
+ processor.tokenizer.apply_chat_template(copy_msgs, tokenize=False, add_generation_prompt=True)
364
+ )
365
  input_images_lists.append(images)
366
 
367
  inputs = processor(
368
+ prompts_lists,
369
+ input_images_lists,
370
  max_slice_nums=max_slice_nums,
371
  use_image_id=use_image_id,
372
+ return_tensors="pt",
373
+ max_length=max_inp_length,
374
  ).to(self.device)
375
 
376
  if sampling:
 
379
  "top_k": 100,
380
  "temperature": 0.7,
381
  "do_sample": True,
382
+ "repetition_penalty": 1.05,
383
  }
384
  else:
385
  generation_config = {
386
  "num_beams": 3,
387
  "repetition_penalty": 1.2,
388
  }
389
+
390
  if min_new_tokens > 0:
391
+ generation_config["min_new_tokens"] = min_new_tokens
392
 
393
+ generation_config.update((k, kwargs[k]) for k in generation_config.keys() & kwargs.keys())
 
 
394
 
395
  inputs.pop("image_sizes")
396
  with torch.inference_mode():
 
401
  vision_hidden_states=vision_hidden_states,
402
  stream=stream,
403
  decode_text=True,
404
+ **generation_config,
405
  )
406
+
407
  if stream:
408
+
409
  def stream_gen():
410
  for text in res:
411
  for term in self.terminators:
412
+ text = text.replace(term, "")
413
  yield text
414
+
415
  return stream_gen()
416
 
417
  else:
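
For context, a hedged usage sketch of the `chat()` entry point reformatted above. The repository id is assumed, and the `msgs` layout (a content list mixing a PIL image and text, with `image=None`) follows the upstream MiniCPM-V convention rather than anything stated in this diff:

import torch
from PIL import Image
from transformers import AutoModel, AutoTokenizer

repo = "katuni4ka/tiny-random-minicpmv-2_6"  # hypothetical repo id, for illustration only
model = AutoModel.from_pretrained(repo, trust_remote_code=True, torch_dtype=torch.float32).eval()
tokenizer = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)

image = Image.new("RGB", (224, 224), "white")  # any RGB PIL image
msgs = [{"role": "user", "content": [image, "Describe the image."]}]

# chat() builds the (<image>./</image>) placeholders, runs the processor and generates text.
answer = model.chat(image=None, msgs=msgs, tokenizer=tokenizer, sampling=False)
print(answer)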
modeling_navit_siglip.py CHANGED
@@ -16,11 +16,11 @@
16
  # Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
17
 
18
 
19
- import os
20
  import math
 
21
  import warnings
22
  from dataclasses import dataclass
23
- from typing import Any, Optional, Tuple, Union
24
 
25
  import numpy as np
26
  import torch
@@ -28,12 +28,11 @@ import torch.nn.functional as F
28
  import torch.utils.checkpoint
29
  from torch import nn
30
  from torch.nn.init import _calculate_fan_in_and_fan_out
31
-
32
  from transformers.activations import ACT2FN
 
33
  from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
34
  from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
35
  from transformers.modeling_utils import PreTrainedModel
36
- from transformers.configuration_utils import PretrainedConfig
37
  from transformers.utils import (
38
  ModelOutput,
39
  add_start_docstrings,
@@ -42,10 +41,11 @@ from transformers.utils import (
42
  logging,
43
  replace_return_docstrings,
44
  )
45
- from transformers.utils import logging
46
 
47
  logger = logging.get_logger(__name__)
48
 
 
49
  class SiglipVisionConfig(PretrainedConfig):
50
  r"""
51
  This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
@@ -133,7 +133,7 @@ class SiglipVisionConfig(PretrainedConfig):
133
  )
134
 
135
  return cls.from_dict(config_dict, **kwargs)
136
-
137
 
138
  _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
139
 
@@ -148,7 +148,6 @@ try:
148
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
149
  except:
150
  pass
151
-
152
 
153
 
154
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
@@ -318,7 +317,12 @@ class SiglipVisionEmbeddings(nn.Module):
318
  self.num_positions = self.num_patches
319
  self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
320
 
321
- def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor, tgt_sizes: Optional[torch.IntTensor]=None) -> torch.Tensor:
 
 
 
 
 
322
  batch_size = pixel_values.size(0)
323
 
324
  patch_embeds = self.patch_embedding(pixel_values)
@@ -643,11 +647,7 @@ class SiglipEncoderLayer(nn.Module):
643
  super().__init__()
644
  self.embed_dim = config.hidden_size
645
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
646
- self.self_attn = (
647
- SiglipAttention(config)
648
- if not self._use_flash_attention_2
649
- else SiglipFlashAttention2(config)
650
- )
651
  self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
652
  self.mlp = SiglipMLP(config)
653
  self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
@@ -847,9 +847,9 @@ class SiglipEncoder(nn.Module):
847
  last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
848
  )
849
 
 
850
  @add_start_docstrings(
851
- """The vision model from SigLIP without any head or projection on top.""",
852
- SIGLIP_START_DOCSTRING
853
  )
854
  class SiglipVisionTransformer(SiglipPreTrainedModel):
855
  config_class = SiglipVisionConfig
@@ -904,14 +904,16 @@ class SiglipVisionTransformer(SiglipPreTrainedModel):
904
  device=pixel_values.device,
905
  )
906
 
907
- hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, tgt_sizes=tgt_sizes)
 
 
908
 
909
  patch_attention_mask = patch_attention_mask.view(batch_size, -1)
910
  # The call to `_upad_input` in `_flash_attention_forward` is expensive
911
  # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
912
  # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
913
  if not torch.any(~patch_attention_mask):
914
- attention_mask=None
915
  else:
916
  attention_mask = (
917
  _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
 
16
  # Copied from HuggingFaceM4/siglip-so400m-14-980-flash-attn2-navit and add tgt_sizes
17
 
18
 
 
19
  import math
20
+ import os
21
  import warnings
22
  from dataclasses import dataclass
23
+ from typing import Optional, Tuple, Union
24
 
25
  import numpy as np
26
  import torch
 
28
  import torch.utils.checkpoint
29
  from torch import nn
30
  from torch.nn.init import _calculate_fan_in_and_fan_out
 
31
  from transformers.activations import ACT2FN
32
+ from transformers.configuration_utils import PretrainedConfig
33
  from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
34
  from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
35
  from transformers.modeling_utils import PreTrainedModel
 
36
  from transformers.utils import (
37
  ModelOutput,
38
  add_start_docstrings,
 
41
  logging,
42
  replace_return_docstrings,
43
  )
44
+
45
 
46
  logger = logging.get_logger(__name__)
47
 
48
+
49
  class SiglipVisionConfig(PretrainedConfig):
50
  r"""
51
  This is the configuration class to store the configuration of a [`SiglipVisionModel`]. It is used to instantiate a
 
133
  )
134
 
135
  return cls.from_dict(config_dict, **kwargs)
136
+
137
 
138
  _CHECKPOINT_FOR_DOC = "google/siglip-base-patch16-224"
139
 
 
148
  from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
149
  except:
150
  pass
 
151
 
152
 
153
  # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 
317
  self.num_positions = self.num_patches
318
  self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
319
 
320
+ def forward(
321
+ self,
322
+ pixel_values: torch.FloatTensor,
323
+ patch_attention_mask: torch.BoolTensor,
324
+ tgt_sizes: Optional[torch.IntTensor] = None,
325
+ ) -> torch.Tensor:
326
  batch_size = pixel_values.size(0)
327
 
328
  patch_embeds = self.patch_embedding(pixel_values)
 
647
  super().__init__()
648
  self.embed_dim = config.hidden_size
649
  self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
650
+ self.self_attn = SiglipAttention(config) if not self._use_flash_attention_2 else SiglipFlashAttention2(config)
 
 
 
 
651
  self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
652
  self.mlp = SiglipMLP(config)
653
  self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
 
847
  last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
848
  )
849
 
850
+
851
  @add_start_docstrings(
852
+ """The vision model from SigLIP without any head or projection on top.""", SIGLIP_START_DOCSTRING
 
853
  )
854
  class SiglipVisionTransformer(SiglipPreTrainedModel):
855
  config_class = SiglipVisionConfig
 
904
  device=pixel_values.device,
905
  )
906
 
907
+ hidden_states = self.embeddings(
908
+ pixel_values=pixel_values, patch_attention_mask=patch_attention_mask, tgt_sizes=tgt_sizes
909
+ )
910
 
911
  patch_attention_mask = patch_attention_mask.view(batch_size, -1)
912
  # The call to `_upad_input` in `_flash_attention_forward` is expensive
913
  # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
914
  # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
915
  if not torch.any(~patch_attention_mask):
916
+ attention_mask = None
917
  else:
918
  attention_mask = (
919
  _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
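
`SiglipVisionTransformer.forward` above consumes a boolean `patch_attention_mask` marking the valid patches of each padded image; the mask is built from `tgt_sizes` exactly as in `get_vllm_embedding` earlier in this commit, and is dropped (set to None) when every position is valid. A standalone sketch with illustrative sizes:

import torch

tgt_sizes = torch.tensor([[2, 3], [1, 2]])  # (patch rows, patch cols) per image
B = tgt_sizes.shape[0]
max_patches = int(torch.max(tgt_sizes[:, 0] * tgt_sizes[:, 1]))

patch_attn_mask = torch.zeros((B, 1, max_patches), dtype=torch.bool)
for i in range(B):
    patch_attn_mask[i, 0, : tgt_sizes[i][0] * tgt_sizes[i][1]] = True

print(patch_attn_mask[:, 0].sum(dim=-1))  # tensor([6, 2]): valid patches per padded image
# If every entry were True, forward() would skip the 4D expansion and pass attention_mask=None;
# otherwise the mask goes through _prepare_4d_attention_mask before attention.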
processing_minicpmv.py CHANGED
@@ -16,15 +16,14 @@
16
  Processor class for MiniCPMV.
17
  """
18
 
19
- from typing import List, Optional, Union, Dict, Any
20
- import torch
21
  import re
 
22
 
23
- from transformers.image_processing_utils import BatchFeature
24
  from transformers.image_utils import ImageInput
25
  from transformers.processing_utils import ProcessorMixin
26
- from transformers.tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
27
- from transformers.utils import TensorType, requires_backends, is_torch_dtype, is_torch_device
28
 
29
  from .image_processing_minicpmv import MiniCPMVBatchFeature
30
 
@@ -49,7 +48,7 @@ class MiniCPMVProcessor(ProcessorMixin):
49
  def __init__(self, image_processor=None, tokenizer=None):
50
  super().__init__(image_processor, tokenizer)
51
  self.version = image_processor.version
52
-
53
  def __call__(
54
  self,
55
  text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],
@@ -59,14 +58,23 @@ class MiniCPMVProcessor(ProcessorMixin):
59
  max_slice_nums: int = None,
60
  use_image_id: bool = None,
61
  return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
62
- **kwargs
63
  ) -> MiniCPMVBatchFeature:
64
-
65
  image_inputs = None
66
  if images is not None:
67
- image_inputs = self.image_processor(images, do_pad=do_pad, max_slice_nums=max_slice_nums, return_tensors=return_tensors)
68
- return self._convert_images_texts_to_inputs(image_inputs, text, max_slice_nums=max_slice_nums, use_image_id=use_image_id, max_length=max_length, **kwargs, return_tensors=return_tensors)
69
-
 
 
 
 
 
 
 
 
 
 
70
  # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
71
  def batch_decode(self, *args, **kwargs):
72
  """
@@ -84,7 +92,7 @@ class MiniCPMVProcessor(ProcessorMixin):
84
  result_text.append(self.tokenizer.decode(result, *args[1:], **kwargs).strip())
85
  return result_text
86
  # return self.tokenizer.batch_decode(*args, **kwargs)
87
-
88
  # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
89
  def decode(self, *args, **kwargs):
90
  """
@@ -95,13 +103,13 @@ class MiniCPMVProcessor(ProcessorMixin):
95
  result = result[result != 0]
96
  if result[0] == self.tokenizer.bos_id:
97
  result = result[1:]
98
- if result[-1] == self.tokenizer.eos_id or (hasattr(self.tokenizer, "eot_id") and result[-1] == self.tokenizer.eot_id):
 
 
99
  result = result[:-1]
100
  return self.tokenizer.decode(result, *args[1:], **kwargs).strip()
101
 
102
- def _convert(
103
- self, input_str, max_inp_length: Optional[int] = None
104
- ):
105
  if self.version > 2.5 or not getattr(self.tokenizer, "add_bos_token", False):
106
  input_ids = self.tokenizer.encode(input_str)
107
  else:
@@ -128,23 +136,25 @@ class MiniCPMVProcessor(ProcessorMixin):
128
  return input_ids, image_bounds
129
 
130
  def _convert_images_texts_to_inputs(
131
- self,
132
- images,
133
- texts: Union[str, List[str]],
134
- truncation=None,
135
- max_length=None,
136
- max_slice_nums=None,
137
- use_image_id=None,
138
- return_tensors=None,
139
- **kwargs
140
- ):
141
  if images is None or not len(images):
142
- model_inputs = self.tokenizer(texts, return_tensors=return_tensors, truncation=truncation, max_length=max_length, **kwargs)
 
 
143
  return MiniCPMVBatchFeature(data={**model_inputs})
144
-
145
  pattern = "(<image>./</image>)"
146
  images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
147
-
148
  if isinstance(texts, str):
149
  texts = [texts]
150
  input_ids_list = []
@@ -155,33 +165,32 @@ class MiniCPMVProcessor(ProcessorMixin):
155
  text_chunks = text.split(pattern)
156
  final_text = ""
157
  for i in range(len(image_tags)):
158
- final_text = final_text + text_chunks[i] + \
159
- self.image_processor.get_slice_image_placeholder(
160
- image_sizes[index][i],
161
- i,
162
- max_slice_nums,
163
- use_image_id
164
  )
 
165
  final_text += text_chunks[-1]
166
  input_ids, image_bounds = self._convert(final_text, max_length)
167
  input_ids_list.append(input_ids)
168
  image_bounds_list.append(image_bounds)
169
- padded_input_ids, padding_lengths = self.pad(
170
- input_ids_list,
171
- padding_side="left"
172
- )
173
  for i, length in enumerate(padding_lengths):
174
  image_bounds_list[i] = image_bounds_list[i] + length
175
  attention_mask = padded_input_ids.ne(0)
176
 
177
- return MiniCPMVBatchFeature(data={
178
- "input_ids": padded_input_ids,
179
- "attention_mask": attention_mask,
180
- "pixel_values": images,
181
- "image_sizes": image_sizes,
182
- "image_bound": image_bounds_list,
183
- "tgt_sizes": tgt_sizes
184
- })
 
 
185
 
186
  @property
187
  # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
@@ -190,7 +199,6 @@ class MiniCPMVProcessor(ProcessorMixin):
190
  image_processor_input_names = self.image_processor.model_input_names
191
  return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
192
 
193
-
194
  def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"):
195
  items = []
196
  if isinstance(inputs[0], list):
@@ -219,10 +227,7 @@ class MiniCPMVProcessor(ProcessorMixin):
219
  return torch.stack([item for item in items], dim=0), [0] * batch_size
220
  tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
221
  else:
222
- tensor = (
223
- torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype)
224
- + padding_value
225
- )
226
 
227
  padding_length = []
228
  for i, item in enumerate(items):
 
  Processor class for MiniCPMV.
  """

  import re
+ from typing import List, Optional, Union

+ import torch
  from transformers.image_utils import ImageInput
  from transformers.processing_utils import ProcessorMixin
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
+ from transformers.utils import TensorType

  from .image_processing_minicpmv import MiniCPMVBatchFeature

  def __init__(self, image_processor=None, tokenizer=None):
  super().__init__(image_processor, tokenizer)
  self.version = image_processor.version
+
  def __call__(
  self,
  text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]],

  max_slice_nums: int = None,
  use_image_id: bool = None,
  return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+ **kwargs,
  ) -> MiniCPMVBatchFeature:

  image_inputs = None
  if images is not None:
+ image_inputs = self.image_processor(
+ images, do_pad=do_pad, max_slice_nums=max_slice_nums, return_tensors=return_tensors
+ )
+ return self._convert_images_texts_to_inputs(
+ image_inputs,
+ text,
+ max_slice_nums=max_slice_nums,
+ use_image_id=use_image_id,
+ max_length=max_length,
+ **kwargs,
+ return_tensors=return_tensors,
+ )
+
  # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
  def batch_decode(self, *args, **kwargs):
  """

  result_text.append(self.tokenizer.decode(result, *args[1:], **kwargs).strip())
  return result_text
  # return self.tokenizer.batch_decode(*args, **kwargs)
+
  # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
  def decode(self, *args, **kwargs):
  """

  result = result[result != 0]
  if result[0] == self.tokenizer.bos_id:
  result = result[1:]
+ if result[-1] == self.tokenizer.eos_id or (
+ hasattr(self.tokenizer, "eot_id") and result[-1] == self.tokenizer.eot_id
+ ):
  result = result[:-1]
  return self.tokenizer.decode(result, *args[1:], **kwargs).strip()

+ def _convert(self, input_str, max_inp_length: Optional[int] = None):
  if self.version > 2.5 or not getattr(self.tokenizer, "add_bos_token", False):
  input_ids = self.tokenizer.encode(input_str)
  else:

  return input_ids, image_bounds

  def _convert_images_texts_to_inputs(
+ self,
+ images,
+ texts: Union[str, List[str]],
+ truncation=None,
+ max_length=None,
+ max_slice_nums=None,
+ use_image_id=None,
+ return_tensors=None,
+ **kwargs,
+ ):
  if images is None or not len(images):
+ model_inputs = self.tokenizer(
+ texts, return_tensors=return_tensors, truncation=truncation, max_length=max_length, **kwargs
+ )
  return MiniCPMVBatchFeature(data={**model_inputs})
+
  pattern = "(<image>./</image>)"
  images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
+
  if isinstance(texts, str):
  texts = [texts]
  input_ids_list = []

  text_chunks = text.split(pattern)
  final_text = ""
  for i in range(len(image_tags)):
+ final_text = (
+ final_text
+ + text_chunks[i]
+ + self.image_processor.get_slice_image_placeholder(
+ image_sizes[index][i], i, max_slice_nums, use_image_id
  )
+ )
  final_text += text_chunks[-1]
  input_ids, image_bounds = self._convert(final_text, max_length)
  input_ids_list.append(input_ids)
  image_bounds_list.append(image_bounds)
+ padded_input_ids, padding_lengths = self.pad(input_ids_list, padding_side="left")
  for i, length in enumerate(padding_lengths):
  image_bounds_list[i] = image_bounds_list[i] + length
  attention_mask = padded_input_ids.ne(0)

+ return MiniCPMVBatchFeature(
+ data={
+ "input_ids": padded_input_ids,
+ "attention_mask": attention_mask,
+ "pixel_values": images,
+ "image_sizes": image_sizes,
+ "image_bound": image_bounds_list,
+ "tgt_sizes": tgt_sizes,
+ }
+ )

  @property
  # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names

  image_processor_input_names = self.image_processor.model_input_names
  return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))

  def pad(self, inputs, max_length=None, padding_value=0, padding_side="left"):
  items = []
  if isinstance(inputs[0], list):

  return torch.stack([item for item in items], dim=0), [0] * batch_size
  tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
  else:
+ tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value

  padding_length = []
  for i, item in enumerate(items):
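Note (not part of the commit): a minimal usage sketch for the MiniCPMVProcessor changed above. The repository path and image file are placeholders, it assumes the checkpoint maps AutoProcessor to this class via trust_remote_code, and the exact nesting of `images` follows the image processor's own conventions; `(<image>./</image>)` is the placeholder pattern the processor splits the text on.

from PIL import Image
from transformers import AutoProcessor

# Placeholder repo path; trust_remote_code loads processing_minicpmv.py shipped with the checkpoint.
processor = AutoProcessor.from_pretrained("path/to/tiny-random-minicpmv-2_6", trust_remote_code=True)
image = Image.open("example.jpg").convert("RGB")  # placeholder image
text = "(<image>./</image>)\nWhat is shown in this picture?"
inputs = processor(text=text, images=[image], return_tensors="pt")
# The returned batch feature carries input_ids, attention_mask, pixel_values,
# image_sizes, image_bound and tgt_sizes, as assembled in _convert_images_texts_to_inputs.
print(inputs["input_ids"].shape, len(inputs["image_bound"]))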
resampler.py CHANGED
@@ -1,18 +1,17 @@
  from functools import partial
  from typing import Optional, Tuple
- import numpy as np
- import warnings

  import torch
- from torch import nn
- from torch import Tensor
  import torch.nn.functional as F
  from torch.nn.functional import *
  from torch.nn.modules.activation import *
- from torch.nn.init import trunc_normal_, constant_, xavier_normal_, xavier_uniform_
-
  from transformers.integrations import is_deepspeed_zero3_enabled

  def get_2d_sincos_pos_embed(embed_dim, image_size):
  """
  image_size: image_size or (image_height, image_width)
@@ -52,10 +51,10 @@ def get_1d_sincos_pos_embed_from_grid_new(embed_dim, pos):
  """
  assert embed_dim % 2 == 0
  omega = np.arange(embed_dim // 2, dtype=np.float32)
- omega /= embed_dim / 2.
- omega = 1. / 10000 ** omega # (D/2,)

- out = np.einsum('hw,d->hwd', pos, omega) # (H, W, D/2), outer product

  emb_sin = np.sin(out) # (H, W, D/2)
  emb_cos = np.cos(out) # (H, W, D/2)
@@ -73,14 +72,14 @@ class Resampler(nn.Module):
  """

  def __init__(
- self,
- num_queries,
- embed_dim,
- num_heads,
- kv_dim=None,
- norm_layer=partial(nn.LayerNorm, eps=1e-6),
- adaptive=False,
- max_size=(70, 70),
  ):
  super().__init__()
  self.num_queries = num_queries
@@ -101,13 +100,13 @@ class Resampler(nn.Module):
  self.ln_kv = norm_layer(embed_dim)

  self.ln_post = norm_layer(embed_dim)
- self.proj = nn.Parameter((embed_dim ** -0.5) * torch.randn(embed_dim, embed_dim))

  self._set_2d_pos_cache(self.max_size)

- def _set_2d_pos_cache(self, max_size, device='cpu'):
  if is_deepspeed_zero3_enabled():
- device='cuda'
  pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, max_size)).float().to(device)
  self.register_buffer("pos_embed", pos_embed, persistent=False)

@@ -120,7 +119,7 @@ class Resampler(nn.Module):

  def _init_weights(self, m):
  if isinstance(m, nn.Linear):
- trunc_normal_(m.weight, std=.02)
  if isinstance(m, nn.Linear) and m.bias is not None:
  nn.init.constant_(m.bias, 0)
  elif isinstance(m, nn.LayerNorm):
@@ -145,10 +144,11 @@ class Resampler(nn.Module):
  for i in range(bs):
  tgt_h, tgt_w = tgt_sizes[i]
  pos_embed.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(dtype)) # patches * D
- key_padding_mask[i, patch_len[i]:] = True

- pos_embed = torch.nn.utils.rnn.pad_sequence(
- pos_embed, batch_first=True, padding_value=0.0).permute(1, 0, 2) # BLD => L * B * D

  x = self.kv_proj(x) # B * L * D
  x = self.ln_kv(x).permute(1, 0, 2) # L * B * D
@@ -159,7 +159,8 @@ class Resampler(nn.Module):
  self._repeat(q, bs), # Q * B * D
  x + pos_embed, # L * B * D + L * B * D
  x,
- key_padding_mask=key_padding_mask)[0]
  # out: Q * B * D
  x = out.permute(1, 0, 2) # B * Q * D

@@ -172,26 +173,44 @@ class Resampler(nn.Module):
172
 
173
 
174
  class MultiheadAttention(nn.MultiheadAttention):
175
- def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False,
176
- add_zero_attn=False, kdim=None, vdim=None, batch_first=False, device=None, dtype=None):
177
- super().__init__(embed_dim, num_heads, dropout, bias, add_bias_kv, add_zero_attn, kdim, vdim, batch_first, device, dtype)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
 
179
  # rewrite out_proj layer,with nn.Linear
180
  self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
181
 
182
  def forward(
183
- self,
184
- query: Tensor,
185
- key: Tensor,
186
- value: Tensor,
187
- key_padding_mask: Optional[Tensor] = None,
188
- need_weights: bool = True,
189
- attn_mask: Optional[Tensor] = None,
190
- average_attn_weights: bool = True,
191
- is_causal : bool = False) -> Tuple[Tensor, Optional[Tensor]]:
192
- why_not_fast_path = ''
193
- if ((attn_mask is not None and torch.is_floating_point(attn_mask))
194
- or (key_padding_mask is not None) and torch.is_floating_point(key_padding_mask)):
 
 
 
 
195
  why_not_fast_path = "floating-point masks are not supported for fast path."
196
 
197
  is_batched = query.dim() == 3
@@ -201,7 +220,7 @@ class MultiheadAttention(nn.MultiheadAttention):
201
  mask_name="key_padding_mask",
202
  other_type=F._none_or_dtype(attn_mask),
203
  other_name="attn_mask",
204
- target_type=query.dtype
205
  )
206
 
207
  attn_mask = _canonical_mask(
@@ -213,7 +232,6 @@ class MultiheadAttention(nn.MultiheadAttention):
213
  check_other=False,
214
  )
215
 
216
-
217
  if not is_batched:
218
  why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"
219
  elif query is not key or key is not value:
@@ -222,12 +240,16 @@ class MultiheadAttention(nn.MultiheadAttention):
222
  # they don't!
223
  why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
224
  elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype:
225
- why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
 
 
226
  elif self.in_proj_weight is None:
227
  why_not_fast_path = "in_proj_weight was None"
228
  elif query.dtype != self.in_proj_weight.dtype:
229
  # this case will fail anyway, but at least they'll get a useful error message.
230
- why_not_fast_path = f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
 
 
231
  elif self.training:
232
  why_not_fast_path = "training is enabled"
233
  elif (self.num_heads % 2) != 0:
@@ -265,11 +287,15 @@ class MultiheadAttention(nn.MultiheadAttention):
265
  elif _is_make_fx_tracing():
266
  why_not_fast_path = "we are running make_fx tracing"
267
  elif not all(_check_arg_device(x) for x in tensor_args):
268
- why_not_fast_path = ("some Tensor argument's device is neither one of "
269
- f"cpu, cuda or {torch.utils.backend_registration._privateuse1_backend_name}")
 
 
270
  elif torch.is_grad_enabled() and any(_arg_requires_grad(x) for x in tensor_args):
271
- why_not_fast_path = ("grad is enabled and at least one of query or the "
272
- "input/output projection weights or biases requires_grad")
 
 
273
  if not why_not_fast_path:
274
  merged_mask, mask_type = self.merge_masks(attn_mask, key_padding_mask, query)
275
 
@@ -287,11 +313,14 @@ class MultiheadAttention(nn.MultiheadAttention):
287
  merged_mask,
288
  need_weights,
289
  average_attn_weights,
290
- mask_type)
 
291
 
292
  any_nested = query.is_nested or key.is_nested or value.is_nested
293
- assert not any_nested, ("MultiheadAttention does not support NestedTensor outside of its fast path. " +
294
- f"The fast path was not hit because {why_not_fast_path}")
 
 
295
 
296
  if self.batch_first and is_batched:
297
  # make sure that the transpose op does not affect the "is" property
@@ -303,38 +332,60 @@ class MultiheadAttention(nn.MultiheadAttention):
303
  value = key
304
  else:
305
  query, key, value = (x.transpose(1, 0) for x in (query, key, value))
306
-
307
  if not self._qkv_same_embed_dim:
308
  attn_output, attn_output_weights = self.multi_head_attention_forward(
309
- query, key, value, self.embed_dim, self.num_heads,
310
- self.in_proj_weight, self.in_proj_bias,
311
- self.bias_k, self.bias_v, self.add_zero_attn,
312
- self.dropout, self.out_proj.weight, self.out_proj.bias,
 
 
 
 
 
 
 
 
 
313
  training=self.training,
314
- key_padding_mask=key_padding_mask, need_weights=need_weights,
 
315
  attn_mask=attn_mask,
316
  use_separate_proj_weight=True,
317
- q_proj_weight=self.q_proj_weight, k_proj_weight=self.k_proj_weight,
 
318
  v_proj_weight=self.v_proj_weight,
319
  average_attn_weights=average_attn_weights,
320
- is_causal=is_causal)
 
321
  else:
322
  attn_output, attn_output_weights = self.multi_head_attention_forward(
323
- query, key, value, self.embed_dim, self.num_heads,
324
- self.in_proj_weight, self.in_proj_bias,
325
- self.bias_k, self.bias_v, self.add_zero_attn,
326
- self.dropout, self.out_proj.weight, self.out_proj.bias,
 
 
 
 
 
 
 
 
 
327
  training=self.training,
328
  key_padding_mask=key_padding_mask,
329
  need_weights=need_weights,
330
  attn_mask=attn_mask,
331
  average_attn_weights=average_attn_weights,
332
- is_causal=is_causal)
 
333
  if self.batch_first and is_batched:
334
  return attn_output.transpose(1, 0), attn_output_weights
335
  else:
336
  return attn_output, attn_output_weights
337
-
338
  def multi_head_attention_forward(
339
  self,
340
  query: Tensor,
@@ -364,9 +415,9 @@ class MultiheadAttention(nn.MultiheadAttention):
364
  is_causal: bool = False,
365
  ) -> Tuple[Tensor, Optional[Tensor]]:
366
  tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias)
367
-
368
  is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
369
-
370
  # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
371
  # is batched, run the computation and before returning squeeze the
372
  # batch dimension so that the output doesn't carry this temporary batch dimension.
@@ -377,26 +428,26 @@ class MultiheadAttention(nn.MultiheadAttention):
377
  value = value.unsqueeze(1)
378
  if key_padding_mask is not None:
379
  key_padding_mask = key_padding_mask.unsqueeze(0)
380
-
381
  # set up shape vars
382
  tgt_len, bsz, embed_dim = query.shape
383
  src_len, _, _ = key.shape
384
-
385
  key_padding_mask = _canonical_mask(
386
  mask=key_padding_mask,
387
  mask_name="key_padding_mask",
388
  other_type=_none_or_dtype(attn_mask),
389
  other_name="attn_mask",
390
- target_type=query.dtype
391
  )
392
-
393
  if is_causal and attn_mask is None:
394
  raise RuntimeError(
395
  "Need attn_mask if specifying the is_causal hint. "
396
  "You may use the Transformer module method "
397
  "`generate_square_subsequent_mask` to create this mask."
398
  )
399
-
400
  if is_causal and key_padding_mask is None and not need_weights:
401
  # when we have a kpm or need weights, we need attn_mask
402
  # Otherwise, we use the is_causal hint go as is_causal
@@ -411,28 +462,30 @@ class MultiheadAttention(nn.MultiheadAttention):
411
  target_type=query.dtype,
412
  check_other=False,
413
  )
414
-
415
  if key_padding_mask is not None:
416
  # We have the attn_mask, and use that to merge kpm into it.
417
  # Turn off use of is_causal hint, as the merged mask is no
418
  # longer causal.
419
  is_causal = False
420
-
421
- assert embed_dim == embed_dim_to_check, \
422
- f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
 
423
  if isinstance(embed_dim, torch.Tensor):
424
  # embed_dim can be a tensor when JIT tracing
425
- head_dim = embed_dim.div(num_heads, rounding_mode='trunc')
426
  else:
427
  head_dim = embed_dim // num_heads
428
  assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
429
  if use_separate_proj_weight:
430
  # allow MHA to have different embedding dimensions when separate projection weights are used
431
- assert key.shape[:2] == value.shape[:2], \
432
- f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
 
433
  else:
434
  assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"
435
-
436
  #
437
  # compute in-projection
438
  #
@@ -448,23 +501,27 @@ class MultiheadAttention(nn.MultiheadAttention):
448
  else:
449
  b_q, b_k, b_v = in_proj_bias.chunk(3)
450
  q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
451
-
452
  # prep attention mask
453
-
454
  if attn_mask is not None:
455
  # ensure attn_mask's dim is 3
456
  if attn_mask.dim() == 2:
457
  correct_2d_size = (tgt_len, src_len)
458
  if attn_mask.shape != correct_2d_size:
459
- raise RuntimeError(f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}.")
 
 
460
  attn_mask = attn_mask.unsqueeze(0)
461
  elif attn_mask.dim() == 3:
462
  correct_3d_size = (bsz * num_heads, tgt_len, src_len)
463
  if attn_mask.shape != correct_3d_size:
464
- raise RuntimeError(f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}.")
 
 
465
  else:
466
  raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
467
-
468
  # add bias along batch dimension (currently second)
469
  if bias_k is not None and bias_v is not None:
470
  assert static_k is None, "bias cannot be added to static key."
@@ -478,7 +535,7 @@ class MultiheadAttention(nn.MultiheadAttention):
478
  else:
479
  assert bias_k is None
480
  assert bias_v is None
481
-
482
  #
483
  # reshape q, k, v for multihead attention and make em batch first
484
  #
@@ -487,21 +544,25 @@ class MultiheadAttention(nn.MultiheadAttention):
487
  k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
488
  else:
489
  # TODO finish disentangling control flow so we don't do in-projections when statics are passed
490
- assert static_k.size(0) == bsz * num_heads, \
491
- f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
492
- assert static_k.size(2) == head_dim, \
493
- f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
 
 
494
  k = static_k
495
  if static_v is None:
496
  v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
497
  else:
498
  # TODO finish disentangling control flow so we don't do in-projections when statics are passed
499
- assert static_v.size(0) == bsz * num_heads, \
500
- f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
501
- assert static_v.size(2) == head_dim, \
502
- f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
 
 
503
  v = static_v
504
-
505
  # add zero attention along batch dimension (now first)
506
  if add_zero_attn:
507
  zero_attn_shape = (bsz * num_heads, 1, head_dim)
@@ -511,35 +572,40 @@ class MultiheadAttention(nn.MultiheadAttention):
511
  attn_mask = pad(attn_mask, (0, 1))
512
  if key_padding_mask is not None:
513
  key_padding_mask = pad(key_padding_mask, (0, 1))
514
-
515
  # update source sequence length after adjustments
516
  src_len = k.size(1)
517
-
518
  # merge key padding and attention masks
519
  if key_padding_mask is not None:
520
- assert key_padding_mask.shape == (bsz, src_len), \
521
- f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
522
- key_padding_mask = key_padding_mask.view(bsz, 1, 1, src_len). \
523
- expand(-1, num_heads, -1, -1).reshape(bsz * num_heads, 1, src_len)
 
 
 
 
 
524
  if attn_mask is None:
525
  attn_mask = key_padding_mask
526
  else:
527
  attn_mask = attn_mask + key_padding_mask
528
-
529
  # adjust dropout probability
530
  if not training:
531
  dropout_p = 0.0
532
-
533
  #
534
  # (deep breath) calculate attention and out projection
535
  #
536
-
537
  if need_weights:
538
  B, Nt, E = q.shape
539
  q_scaled = q / math.sqrt(E)
540
-
541
  assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights"
542
-
543
  if attn_mask is not None:
544
  attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
545
  else:
@@ -547,18 +613,18 @@ class MultiheadAttention(nn.MultiheadAttention):
547
  attn_output_weights = softmax(attn_output_weights, dim=-1)
548
  if dropout_p > 0.0:
549
  attn_output_weights = dropout(attn_output_weights, p=dropout_p)
550
-
551
  attn_output = torch.bmm(attn_output_weights, v)
552
-
553
  attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
554
  attn_output = self.out_proj(attn_output)
555
  attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
556
-
557
  # optionally average attention weights over heads
558
  attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
559
  if average_attn_weights:
560
  attn_output_weights = attn_output_weights.mean(dim=1)
561
-
562
  if not is_batched:
563
  # squeeze the output if input was unbatched
564
  attn_output = attn_output.squeeze(1)
@@ -573,14 +639,14 @@ class MultiheadAttention(nn.MultiheadAttention):
573
  attn_mask = attn_mask.unsqueeze(0)
574
  else:
575
  attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)
576
-
577
  q = q.view(bsz, num_heads, tgt_len, head_dim)
578
  k = k.view(bsz, num_heads, src_len, head_dim)
579
  v = v.view(bsz, num_heads, src_len, head_dim)
580
-
581
  attn_output = F.scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
582
  attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
583
-
584
  attn_output = self.out_proj(attn_output)
585
  attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
586
  if not is_batched:
@@ -589,8 +655,14 @@ class MultiheadAttention(nn.MultiheadAttention):
589
  return attn_output, None
590
 
591
 
592
- def _mha_shape_check(query: Tensor, key: Tensor, value: Tensor,
593
- key_padding_mask: Optional[Tensor], attn_mask: Optional[Tensor], num_heads: int):
 
 
 
 
 
 
594
  # Verifies the expected shape for `query, `key`, `value`, `key_padding_mask` and `attn_mask`
595
  # and returns if the input is batched or not.
596
  # Raises an error if `query` is not 2-D (unbatched) or 3-D (batched) tensor.
@@ -599,59 +671,65 @@ def _mha_shape_check(query: Tensor, key: Tensor, value: Tensor,
599
  if query.dim() == 3:
600
  # Batched Inputs
601
  is_batched = True
602
- assert key.dim() == 3 and value.dim() == 3, \
603
- ("For batched (3-D) `query`, expected `key` and `value` to be 3-D"
604
- f" but found {key.dim()}-D and {value.dim()}-D tensors respectively")
 
605
  if key_padding_mask is not None:
606
- assert key_padding_mask.dim() == 2, \
607
- ("For batched (3-D) `query`, expected `key_padding_mask` to be `None` or 2-D"
608
- f" but found {key_padding_mask.dim()}-D tensor instead")
 
609
  if attn_mask is not None:
610
- assert attn_mask.dim() in (2, 3), \
611
- ("For batched (3-D) `query`, expected `attn_mask` to be `None`, 2-D or 3-D"
612
- f" but found {attn_mask.dim()}-D tensor instead")
 
613
  elif query.dim() == 2:
614
  # Unbatched Inputs
615
  is_batched = False
616
- assert key.dim() == 2 and value.dim() == 2, \
617
- ("For unbatched (2-D) `query`, expected `key` and `value` to be 2-D"
618
- f" but found {key.dim()}-D and {value.dim()}-D tensors respectively")
 
619
 
620
  if key_padding_mask is not None:
621
- assert key_padding_mask.dim() == 1, \
622
- ("For unbatched (2-D) `query`, expected `key_padding_mask` to be `None` or 1-D"
623
- f" but found {key_padding_mask.dim()}-D tensor instead")
 
624
 
625
  if attn_mask is not None:
626
- assert attn_mask.dim() in (2, 3), \
627
- ("For unbatched (2-D) `query`, expected `attn_mask` to be `None`, 2-D or 3-D"
628
- f" but found {attn_mask.dim()}-D tensor instead")
 
629
  if attn_mask.dim() == 3:
630
  expected_shape = (num_heads, query.shape[0], key.shape[0])
631
- assert attn_mask.shape == expected_shape, \
632
- (f"Expected `attn_mask` shape to be {expected_shape} but got {attn_mask.shape}")
 
633
  else:
634
  raise AssertionError(
635
- f"query should be unbatched 2D or batched 3D tensor but received {query.dim()}-D query tensor")
 
636
 
637
  return is_batched
638
 
639
 
640
  def _canonical_mask(
641
- mask: Optional[Tensor],
642
- mask_name: str,
643
- other_type: Optional[DType],
644
- other_name: str,
645
- target_type: DType,
646
- check_other: bool = True,
647
  ) -> Optional[Tensor]:
648
-
649
  if mask is not None:
650
  _mask_dtype = mask.dtype
651
  _mask_is_float = torch.is_floating_point(mask)
652
  if _mask_dtype != torch.bool and not _mask_is_float:
653
- raise AssertionError(
654
- f"only bool and floating types of {mask_name} are supported")
655
  if check_other and other_type is not None:
656
  if _mask_dtype != other_type:
657
  warnings.warn(
@@ -659,10 +737,7 @@ def _canonical_mask(
659
  "is deprecated. Use same type for both instead."
660
  )
661
  if not _mask_is_float:
662
- mask = (
663
- torch.zeros_like(mask, dtype=target_type)
664
- .masked_fill_(mask, float("-inf"))
665
- )
666
  return mask
667
 
668
 
@@ -673,6 +748,7 @@ def _none_or_dtype(input: Optional[Tensor]) -> Optional[DType]:
673
  return input.dtype
674
  raise RuntimeError("input to _none_or_dtype() must be None or torch.Tensor")
675
 
 
676
  def _in_projection_packed(
677
  q: Tensor,
678
  k: Tensor,
@@ -779,4 +855,4 @@ def _in_projection(
779
  assert b_q is None or b_q.shape == (Eq,), f"expecting query bias shape of {(Eq,)}, but got {b_q.shape}"
780
  assert b_k is None or b_k.shape == (Eq,), f"expecting key bias shape of {(Eq,)}, but got {b_k.shape}"
781
  assert b_v is None or b_v.shape == (Eq,), f"expecting value bias shape of {(Eq,)}, but got {b_v.shape}"
782
- return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
 
+ import warnings
  from functools import partial
  from typing import Optional, Tuple

+ import numpy as np
  import torch
  import torch.nn.functional as F
+ from torch import Tensor, nn
  from torch.nn.functional import *
+ from torch.nn.init import trunc_normal_
  from torch.nn.modules.activation import *
  from transformers.integrations import is_deepspeed_zero3_enabled

+
  def get_2d_sincos_pos_embed(embed_dim, image_size):
  """
  image_size: image_size or (image_height, image_width)

  """
  assert embed_dim % 2 == 0
  omega = np.arange(embed_dim // 2, dtype=np.float32)
+ omega /= embed_dim / 2.0
+ omega = 1.0 / 10000**omega # (D/2,)

+ out = np.einsum("hw,d->hwd", pos, omega) # (H, W, D/2), outer product

  emb_sin = np.sin(out) # (H, W, D/2)
  emb_cos = np.cos(out) # (H, W, D/2)

  """

  def __init__(
+ self,
+ num_queries,
+ embed_dim,
+ num_heads,
+ kv_dim=None,
+ norm_layer=partial(nn.LayerNorm, eps=1e-6),
+ adaptive=False,
+ max_size=(70, 70),
  ):
  super().__init__()
  self.num_queries = num_queries

  self.ln_kv = norm_layer(embed_dim)

  self.ln_post = norm_layer(embed_dim)
+ self.proj = nn.Parameter((embed_dim**-0.5) * torch.randn(embed_dim, embed_dim))

  self._set_2d_pos_cache(self.max_size)

+ def _set_2d_pos_cache(self, max_size, device="cpu"):
  if is_deepspeed_zero3_enabled():
+ device = "cuda"
  pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.embed_dim, max_size)).float().to(device)
  self.register_buffer("pos_embed", pos_embed, persistent=False)

  def _init_weights(self, m):
  if isinstance(m, nn.Linear):
+ trunc_normal_(m.weight, std=0.02)
  if isinstance(m, nn.Linear) and m.bias is not None:
  nn.init.constant_(m.bias, 0)
  elif isinstance(m, nn.LayerNorm):

  for i in range(bs):
  tgt_h, tgt_w = tgt_sizes[i]
  pos_embed.append(self.pos_embed[:tgt_h, :tgt_w, :].reshape((tgt_h * tgt_w, -1)).to(dtype)) # patches * D
+ key_padding_mask[i, patch_len[i] :] = True

+ pos_embed = torch.nn.utils.rnn.pad_sequence(pos_embed, batch_first=True, padding_value=0.0).permute(
+ 1, 0, 2
+ ) # BLD => L * B * D

  x = self.kv_proj(x) # B * L * D
  x = self.ln_kv(x).permute(1, 0, 2) # L * B * D

  self._repeat(q, bs), # Q * B * D
  x + pos_embed, # L * B * D + L * B * D
  x,
+ key_padding_mask=key_padding_mask,
+ )[0]
  # out: Q * B * D
  x = out.permute(1, 0, 2) # B * Q * D

  class MultiheadAttention(nn.MultiheadAttention):
176
+ def __init__(
177
+ self,
178
+ embed_dim,
179
+ num_heads,
180
+ dropout=0.0,
181
+ bias=True,
182
+ add_bias_kv=False,
183
+ add_zero_attn=False,
184
+ kdim=None,
185
+ vdim=None,
186
+ batch_first=False,
187
+ device=None,
188
+ dtype=None,
189
+ ):
190
+ super().__init__(
191
+ embed_dim, num_heads, dropout, bias, add_bias_kv, add_zero_attn, kdim, vdim, batch_first, device, dtype
192
+ )
193
 
194
  # rewrite out_proj layer,with nn.Linear
195
  self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias, device=device, dtype=dtype)
196
 
197
  def forward(
198
+ self,
199
+ query: Tensor,
200
+ key: Tensor,
201
+ value: Tensor,
202
+ key_padding_mask: Optional[Tensor] = None,
203
+ need_weights: bool = True,
204
+ attn_mask: Optional[Tensor] = None,
205
+ average_attn_weights: bool = True,
206
+ is_causal: bool = False,
207
+ ) -> Tuple[Tensor, Optional[Tensor]]:
208
+ why_not_fast_path = ""
209
+ if (
210
+ (attn_mask is not None and torch.is_floating_point(attn_mask))
211
+ or (key_padding_mask is not None)
212
+ and torch.is_floating_point(key_padding_mask)
213
+ ):
214
  why_not_fast_path = "floating-point masks are not supported for fast path."
215
 
216
  is_batched = query.dim() == 3
 
220
  mask_name="key_padding_mask",
221
  other_type=F._none_or_dtype(attn_mask),
222
  other_name="attn_mask",
223
+ target_type=query.dtype,
224
  )
225
 
226
  attn_mask = _canonical_mask(
 
232
  check_other=False,
233
  )
234
 
 
235
  if not is_batched:
236
  why_not_fast_path = f"input not batched; expected query.dim() of 3 but got {query.dim()}"
237
  elif query is not key or key is not value:
 
240
  # they don't!
241
  why_not_fast_path = "non-self attention was used (query, key, and value are not the same Tensor)"
242
  elif self.in_proj_bias is not None and query.dtype != self.in_proj_bias.dtype:
243
+ why_not_fast_path = (
244
+ f"dtypes of query ({query.dtype}) and self.in_proj_bias ({self.in_proj_bias.dtype}) don't match"
245
+ )
246
  elif self.in_proj_weight is None:
247
  why_not_fast_path = "in_proj_weight was None"
248
  elif query.dtype != self.in_proj_weight.dtype:
249
  # this case will fail anyway, but at least they'll get a useful error message.
250
+ why_not_fast_path = (
251
+ f"dtypes of query ({query.dtype}) and self.in_proj_weight ({self.in_proj_weight.dtype}) don't match"
252
+ )
253
  elif self.training:
254
  why_not_fast_path = "training is enabled"
255
  elif (self.num_heads % 2) != 0:
 
287
  elif _is_make_fx_tracing():
288
  why_not_fast_path = "we are running make_fx tracing"
289
  elif not all(_check_arg_device(x) for x in tensor_args):
290
+ why_not_fast_path = (
291
+ "some Tensor argument's device is neither one of "
292
+ f"cpu, cuda or {torch.utils.backend_registration._privateuse1_backend_name}"
293
+ )
294
  elif torch.is_grad_enabled() and any(_arg_requires_grad(x) for x in tensor_args):
295
+ why_not_fast_path = (
296
+ "grad is enabled and at least one of query or the "
297
+ "input/output projection weights or biases requires_grad"
298
+ )
299
  if not why_not_fast_path:
300
  merged_mask, mask_type = self.merge_masks(attn_mask, key_padding_mask, query)
301
 
 
313
  merged_mask,
314
  need_weights,
315
  average_attn_weights,
316
+ mask_type,
317
+ )
318
 
319
  any_nested = query.is_nested or key.is_nested or value.is_nested
320
+ assert not any_nested, (
321
+ "MultiheadAttention does not support NestedTensor outside of its fast path. "
322
+ + f"The fast path was not hit because {why_not_fast_path}"
323
+ )
324
 
325
  if self.batch_first and is_batched:
326
  # make sure that the transpose op does not affect the "is" property
 
332
  value = key
333
  else:
334
  query, key, value = (x.transpose(1, 0) for x in (query, key, value))
335
+
336
  if not self._qkv_same_embed_dim:
337
  attn_output, attn_output_weights = self.multi_head_attention_forward(
338
+ query,
339
+ key,
340
+ value,
341
+ self.embed_dim,
342
+ self.num_heads,
343
+ self.in_proj_weight,
344
+ self.in_proj_bias,
345
+ self.bias_k,
346
+ self.bias_v,
347
+ self.add_zero_attn,
348
+ self.dropout,
349
+ self.out_proj.weight,
350
+ self.out_proj.bias,
351
  training=self.training,
352
+ key_padding_mask=key_padding_mask,
353
+ need_weights=need_weights,
354
  attn_mask=attn_mask,
355
  use_separate_proj_weight=True,
356
+ q_proj_weight=self.q_proj_weight,
357
+ k_proj_weight=self.k_proj_weight,
358
  v_proj_weight=self.v_proj_weight,
359
  average_attn_weights=average_attn_weights,
360
+ is_causal=is_causal,
361
+ )
362
  else:
363
  attn_output, attn_output_weights = self.multi_head_attention_forward(
364
+ query,
365
+ key,
366
+ value,
367
+ self.embed_dim,
368
+ self.num_heads,
369
+ self.in_proj_weight,
370
+ self.in_proj_bias,
371
+ self.bias_k,
372
+ self.bias_v,
373
+ self.add_zero_attn,
374
+ self.dropout,
375
+ self.out_proj.weight,
376
+ self.out_proj.bias,
377
  training=self.training,
378
  key_padding_mask=key_padding_mask,
379
  need_weights=need_weights,
380
  attn_mask=attn_mask,
381
  average_attn_weights=average_attn_weights,
382
+ is_causal=is_causal,
383
+ )
384
  if self.batch_first and is_batched:
385
  return attn_output.transpose(1, 0), attn_output_weights
386
  else:
387
  return attn_output, attn_output_weights
388
+
389
  def multi_head_attention_forward(
390
  self,
391
  query: Tensor,
 
415
  is_causal: bool = False,
416
  ) -> Tuple[Tensor, Optional[Tensor]]:
417
  tens_ops = (query, key, value, in_proj_weight, in_proj_bias, bias_k, bias_v, out_proj_weight, out_proj_bias)
418
+
419
  is_batched = _mha_shape_check(query, key, value, key_padding_mask, attn_mask, num_heads)
420
+
421
  # For unbatched input, we unsqueeze at the expected batch-dim to pretend that the input
422
  # is batched, run the computation and before returning squeeze the
423
  # batch dimension so that the output doesn't carry this temporary batch dimension.
 
428
  value = value.unsqueeze(1)
429
  if key_padding_mask is not None:
430
  key_padding_mask = key_padding_mask.unsqueeze(0)
431
+
432
  # set up shape vars
433
  tgt_len, bsz, embed_dim = query.shape
434
  src_len, _, _ = key.shape
435
+
436
  key_padding_mask = _canonical_mask(
437
  mask=key_padding_mask,
438
  mask_name="key_padding_mask",
439
  other_type=_none_or_dtype(attn_mask),
440
  other_name="attn_mask",
441
+ target_type=query.dtype,
442
  )
443
+
444
  if is_causal and attn_mask is None:
445
  raise RuntimeError(
446
  "Need attn_mask if specifying the is_causal hint. "
447
  "You may use the Transformer module method "
448
  "`generate_square_subsequent_mask` to create this mask."
449
  )
450
+
451
  if is_causal and key_padding_mask is None and not need_weights:
452
  # when we have a kpm or need weights, we need attn_mask
453
  # Otherwise, we use the is_causal hint go as is_causal
 
462
  target_type=query.dtype,
463
  check_other=False,
464
  )
465
+
466
  if key_padding_mask is not None:
467
  # We have the attn_mask, and use that to merge kpm into it.
468
  # Turn off use of is_causal hint, as the merged mask is no
469
  # longer causal.
470
  is_causal = False
471
+
472
+ assert (
473
+ embed_dim == embed_dim_to_check
474
+ ), f"was expecting embedding dimension of {embed_dim_to_check}, but got {embed_dim}"
475
  if isinstance(embed_dim, torch.Tensor):
476
  # embed_dim can be a tensor when JIT tracing
477
+ head_dim = embed_dim.div(num_heads, rounding_mode="trunc")
478
  else:
479
  head_dim = embed_dim // num_heads
480
  assert head_dim * num_heads == embed_dim, f"embed_dim {embed_dim} not divisible by num_heads {num_heads}"
481
  if use_separate_proj_weight:
482
  # allow MHA to have different embedding dimensions when separate projection weights are used
483
+ assert (
484
+ key.shape[:2] == value.shape[:2]
485
+ ), f"key's sequence and batch dims {key.shape[:2]} do not match value's {value.shape[:2]}"
486
  else:
487
  assert key.shape == value.shape, f"key shape {key.shape} does not match value shape {value.shape}"
488
+
489
  #
490
  # compute in-projection
491
  #
 
501
  else:
502
  b_q, b_k, b_v = in_proj_bias.chunk(3)
503
  q, k, v = _in_projection(query, key, value, q_proj_weight, k_proj_weight, v_proj_weight, b_q, b_k, b_v)
504
+
505
  # prep attention mask
506
+
507
  if attn_mask is not None:
508
  # ensure attn_mask's dim is 3
509
  if attn_mask.dim() == 2:
510
  correct_2d_size = (tgt_len, src_len)
511
  if attn_mask.shape != correct_2d_size:
512
+ raise RuntimeError(
513
+ f"The shape of the 2D attn_mask is {attn_mask.shape}, but should be {correct_2d_size}."
514
+ )
515
  attn_mask = attn_mask.unsqueeze(0)
516
  elif attn_mask.dim() == 3:
517
  correct_3d_size = (bsz * num_heads, tgt_len, src_len)
518
  if attn_mask.shape != correct_3d_size:
519
+ raise RuntimeError(
520
+ f"The shape of the 3D attn_mask is {attn_mask.shape}, but should be {correct_3d_size}."
521
+ )
522
  else:
523
  raise RuntimeError(f"attn_mask's dimension {attn_mask.dim()} is not supported")
524
+
525
  # add bias along batch dimension (currently second)
526
  if bias_k is not None and bias_v is not None:
527
  assert static_k is None, "bias cannot be added to static key."
 
535
  else:
536
  assert bias_k is None
537
  assert bias_v is None
538
+
539
  #
540
  # reshape q, k, v for multihead attention and make em batch first
541
  #
 
544
  k = k.view(k.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
545
  else:
546
  # TODO finish disentangling control flow so we don't do in-projections when statics are passed
547
+ assert (
548
+ static_k.size(0) == bsz * num_heads
549
+ ), f"expecting static_k.size(0) of {bsz * num_heads}, but got {static_k.size(0)}"
550
+ assert (
551
+ static_k.size(2) == head_dim
552
+ ), f"expecting static_k.size(2) of {head_dim}, but got {static_k.size(2)}"
553
  k = static_k
554
  if static_v is None:
555
  v = v.view(v.shape[0], bsz * num_heads, head_dim).transpose(0, 1)
556
  else:
557
  # TODO finish disentangling control flow so we don't do in-projections when statics are passed
558
+ assert (
559
+ static_v.size(0) == bsz * num_heads
560
+ ), f"expecting static_v.size(0) of {bsz * num_heads}, but got {static_v.size(0)}"
561
+ assert (
562
+ static_v.size(2) == head_dim
563
+ ), f"expecting static_v.size(2) of {head_dim}, but got {static_v.size(2)}"
564
  v = static_v
565
+
566
  # add zero attention along batch dimension (now first)
567
  if add_zero_attn:
568
  zero_attn_shape = (bsz * num_heads, 1, head_dim)
 
572
  attn_mask = pad(attn_mask, (0, 1))
573
  if key_padding_mask is not None:
574
  key_padding_mask = pad(key_padding_mask, (0, 1))
575
+
576
  # update source sequence length after adjustments
577
  src_len = k.size(1)
578
+
579
  # merge key padding and attention masks
580
  if key_padding_mask is not None:
581
+ assert key_padding_mask.shape == (
582
+ bsz,
583
+ src_len,
584
+ ), f"expecting key_padding_mask shape of {(bsz, src_len)}, but got {key_padding_mask.shape}"
585
+ key_padding_mask = (
586
+ key_padding_mask.view(bsz, 1, 1, src_len)
587
+ .expand(-1, num_heads, -1, -1)
588
+ .reshape(bsz * num_heads, 1, src_len)
589
+ )
590
  if attn_mask is None:
591
  attn_mask = key_padding_mask
592
  else:
593
  attn_mask = attn_mask + key_padding_mask
594
+
595
  # adjust dropout probability
596
  if not training:
597
  dropout_p = 0.0
598
+
599
  #
600
  # (deep breath) calculate attention and out projection
601
  #
602
+
603
  if need_weights:
604
  B, Nt, E = q.shape
605
  q_scaled = q / math.sqrt(E)
606
+
607
  assert not (is_causal and attn_mask is None), "FIXME: is_causal not implemented for need_weights"
608
+
609
  if attn_mask is not None:
610
  attn_output_weights = torch.baddbmm(attn_mask, q_scaled, k.transpose(-2, -1))
611
  else:
 
613
  attn_output_weights = softmax(attn_output_weights, dim=-1)
614
  if dropout_p > 0.0:
615
  attn_output_weights = dropout(attn_output_weights, p=dropout_p)
616
+
617
  attn_output = torch.bmm(attn_output_weights, v)
618
+
619
  attn_output = attn_output.transpose(0, 1).contiguous().view(tgt_len * bsz, embed_dim)
620
  attn_output = self.out_proj(attn_output)
621
  attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
622
+
623
  # optionally average attention weights over heads
624
  attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, src_len)
625
  if average_attn_weights:
626
  attn_output_weights = attn_output_weights.mean(dim=1)
627
+
628
  if not is_batched:
629
  # squeeze the output if input was unbatched
630
  attn_output = attn_output.squeeze(1)
 
639
  attn_mask = attn_mask.unsqueeze(0)
640
  else:
641
  attn_mask = attn_mask.view(bsz, num_heads, -1, src_len)
642
+
643
  q = q.view(bsz, num_heads, tgt_len, head_dim)
644
  k = k.view(bsz, num_heads, src_len, head_dim)
645
  v = v.view(bsz, num_heads, src_len, head_dim)
646
+
647
  attn_output = F.scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)
648
  attn_output = attn_output.permute(2, 0, 1, 3).contiguous().view(bsz * tgt_len, embed_dim)
649
+
650
  attn_output = self.out_proj(attn_output)
651
  attn_output = attn_output.view(tgt_len, bsz, attn_output.size(1))
652
  if not is_batched:
 
655
  return attn_output, None
656
 
657
 
658
+ def _mha_shape_check(
659
+ query: Tensor,
660
+ key: Tensor,
661
+ value: Tensor,
662
+ key_padding_mask: Optional[Tensor],
663
+ attn_mask: Optional[Tensor],
664
+ num_heads: int,
665
+ ):
666
  # Verifies the expected shape for `query, `key`, `value`, `key_padding_mask` and `attn_mask`
667
  # and returns if the input is batched or not.
668
  # Raises an error if `query` is not 2-D (unbatched) or 3-D (batched) tensor.
 
671
  if query.dim() == 3:
672
  # Batched Inputs
673
  is_batched = True
674
+ assert key.dim() == 3 and value.dim() == 3, (
675
+ "For batched (3-D) `query`, expected `key` and `value` to be 3-D"
676
+ f" but found {key.dim()}-D and {value.dim()}-D tensors respectively"
677
+ )
678
  if key_padding_mask is not None:
679
+ assert key_padding_mask.dim() == 2, (
680
+ "For batched (3-D) `query`, expected `key_padding_mask` to be `None` or 2-D"
681
+ f" but found {key_padding_mask.dim()}-D tensor instead"
682
+ )
683
  if attn_mask is not None:
684
+ assert attn_mask.dim() in (2, 3), (
685
+ "For batched (3-D) `query`, expected `attn_mask` to be `None`, 2-D or 3-D"
686
+ f" but found {attn_mask.dim()}-D tensor instead"
687
+ )
688
  elif query.dim() == 2:
689
  # Unbatched Inputs
690
  is_batched = False
691
+ assert key.dim() == 2 and value.dim() == 2, (
692
+ "For unbatched (2-D) `query`, expected `key` and `value` to be 2-D"
693
+ f" but found {key.dim()}-D and {value.dim()}-D tensors respectively"
694
+ )
695
 
696
  if key_padding_mask is not None:
697
+ assert key_padding_mask.dim() == 1, (
698
+ "For unbatched (2-D) `query`, expected `key_padding_mask` to be `None` or 1-D"
699
+ f" but found {key_padding_mask.dim()}-D tensor instead"
700
+ )
701
 
702
  if attn_mask is not None:
703
+ assert attn_mask.dim() in (2, 3), (
704
+ "For unbatched (2-D) `query`, expected `attn_mask` to be `None`, 2-D or 3-D"
705
+ f" but found {attn_mask.dim()}-D tensor instead"
706
+ )
707
  if attn_mask.dim() == 3:
708
  expected_shape = (num_heads, query.shape[0], key.shape[0])
709
+ assert (
710
+ attn_mask.shape == expected_shape
711
+ ), f"Expected `attn_mask` shape to be {expected_shape} but got {attn_mask.shape}"
712
  else:
713
  raise AssertionError(
714
+ f"query should be unbatched 2D or batched 3D tensor but received {query.dim()}-D query tensor"
715
+ )
716
 
717
  return is_batched
718
 
719
 
720
  def _canonical_mask(
721
+ mask: Optional[Tensor],
722
+ mask_name: str,
723
+ other_type: Optional[DType],
724
+ other_name: str,
725
+ target_type: DType,
726
+ check_other: bool = True,
727
  ) -> Optional[Tensor]:
 
728
  if mask is not None:
729
  _mask_dtype = mask.dtype
730
  _mask_is_float = torch.is_floating_point(mask)
731
  if _mask_dtype != torch.bool and not _mask_is_float:
732
+ raise AssertionError(f"only bool and floating types of {mask_name} are supported")
 
733
  if check_other and other_type is not None:
734
  if _mask_dtype != other_type:
735
  warnings.warn(
 
737
  "is deprecated. Use same type for both instead."
738
  )
739
  if not _mask_is_float:
740
+ mask = torch.zeros_like(mask, dtype=target_type).masked_fill_(mask, float("-inf"))
 
 
 
741
  return mask
742
 
743
 
 
748
  return input.dtype
749
  raise RuntimeError("input to _none_or_dtype() must be None or torch.Tensor")
750
 
751
+
752
  def _in_projection_packed(
753
  q: Tensor,
754
  k: Tensor,
 
855
  assert b_q is None or b_q.shape == (Eq,), f"expecting query bias shape of {(Eq,)}, but got {b_q.shape}"
856
  assert b_k is None or b_k.shape == (Eq,), f"expecting key bias shape of {(Eq,)}, but got {b_k.shape}"
857
  assert b_v is None or b_v.shape == (Eq,), f"expecting value bias shape of {(Eq,)}, but got {b_v.shape}"
858
+ return linear(q, w_q, b_q), linear(k, w_k, b_k), linear(v, w_v, b_v)
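Note (not part of the commit): the resampler changes above are formatting-only. As a standalone sketch of the sin/cos positional-embedding math that get_1d_sincos_pos_embed_from_grid_new implements (the grid construction and the final sin/cos concatenation here are assumptions for illustration):

import numpy as np

def sincos_1d(embed_dim: int, pos: np.ndarray) -> np.ndarray:
    # pos: (H, W) grid of positions; returns (H, W, embed_dim) embeddings.
    assert embed_dim % 2 == 0
    omega = np.arange(embed_dim // 2, dtype=np.float32)
    omega /= embed_dim / 2.0
    omega = 1.0 / 10000**omega                      # (D/2,)
    out = np.einsum("hw,d->hwd", pos, omega)        # (H, W, D/2)
    return np.concatenate([np.sin(out), np.cos(out)], axis=-1)

h, w, dim = 4, 6, 8
grid_h = np.arange(h, dtype=np.float32)[:, None].repeat(w, axis=1)  # row index per cell
emb_h = sincos_1d(dim, grid_h)
print(emb_h.shape)  # (4, 6, 8)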
tokenization_minicpmv_fast.py CHANGED
@@ -40,7 +40,7 @@ class MiniCPMVTokenizerFast(Qwen2TokenizerFast):
  @property
  def slice_start_id(self):
  return self.convert_tokens_to_ids(self.slice_start)
-
  @property
  def slice_end_id(self):
  return self.convert_tokens_to_ids(self.slice_end)
@@ -48,14 +48,14 @@ class MiniCPMVTokenizerFast(Qwen2TokenizerFast):
  @property
  def im_id_start_id(self):
  return self.convert_tokens_to_ids(self.im_id_start)
-
  @property
  def im_id_end_id(self):
  return self.convert_tokens_to_ids(self.im_id_end)
-
  @property
  def newline_id(self):
- return self.convert_tokens_to_ids('\n')

  @staticmethod
  def escape(text: str) -> str:
@@ -63,4 +63,4 @@ class MiniCPMVTokenizerFast(Qwen2TokenizerFast):

  @staticmethod
  def unescape(text: str) -> str:
- return text
 
  @property
  def slice_start_id(self):
  return self.convert_tokens_to_ids(self.slice_start)
+
  @property
  def slice_end_id(self):
  return self.convert_tokens_to_ids(self.slice_end)

  @property
  def im_id_start_id(self):
  return self.convert_tokens_to_ids(self.im_id_start)
+
  @property
  def im_id_end_id(self):
  return self.convert_tokens_to_ids(self.im_id_end)
+
  @property
  def newline_id(self):
+ return self.convert_tokens_to_ids("\n")

  @staticmethod
  def escape(text: str) -> str:

  @staticmethod
  def unescape(text: str) -> str:
+ return text
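Note (not part of the commit): a quick, hypothetical way to exercise the tokenizer helpers above; the repository path is a placeholder, and the slice/image-id marker tokens are attributes defined elsewhere in MiniCPMVTokenizerFast.

from transformers import AutoTokenizer

# trust_remote_code loads tokenization_minicpmv_fast.py shipped with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained("path/to/tiny-random-minicpmv-2_6", trust_remote_code=True)
assert tokenizer.newline_id == tokenizer.convert_tokens_to_ids("\n")
print(tokenizer.slice_start_id, tokenizer.slice_end_id, tokenizer.im_id_start_id, tokenizer.im_id_end_id)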