WeMM / vision_model.py

feipengma

update wemm

ff26c9f 5 months ago

32.4 kB

	from transformers import PretrainedConfig, PreTrainedModel

	import inspect
	import math
	from dataclasses import dataclass
	from typing import Dict, List, Optional, Tuple, Union
	import json

	import torch
	import torch.nn.functional as F
	import torch.utils.checkpoint
	from torch import nn
	from torch.nn import CrossEntropyLoss

	from transformers.activations import ACT2FN
	from transformers.cache_utils import Cache, DynamicCache
	from transformers.modeling_attn_mask_utils import _prepare_4d_attention_mask
	from transformers.modeling_outputs import BaseModelOutput, ModelOutput
	from transformers.utils import (
	add_start_docstrings,
	add_start_docstrings_to_model_forward,
	is_flash_attn_2_available,
	is_flash_attn_greater_or_equal_2_10,
	logging,
	replace_return_docstrings,
	)

	if is_flash_attn_2_available():
	    # flash-attn is an optional dependency: its symbols are only imported when the
	    # availability check succeeds.
	    from flash_attn import flash_attn_func, flash_attn_varlen_func
	    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

	    # True when the installed `flash_attn_func` accepts a `window_size` argument.
	    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)

	# Module-level logger. `Idefics2VisionFlashAttention2.forward` calls `logger.warning_once`,
	# which raised a NameError before because no logger was ever created in this file.
	logger = logging.get_logger(__name__)


	class Idefics2VisionConfig(PretrainedConfig):
	    r"""
	    This is the configuration class to store the configuration of a [`Idefics2VisionModel`]. It is used to instantiate a
	    Idefics2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
	    configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
	    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) used in the Idefics2 model
	    [HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b).

	    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
	    documentation from [`PretrainedConfig`] for more information.

	    Args:
	        hidden_size (`int`, *optional*, defaults to 768):
	            Dimensionality of the encoder layers and the pooler layer.
	        intermediate_size (`int`, *optional*, defaults to 3072):
	            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
	        num_hidden_layers (`int`, *optional*, defaults to 12):
	            Number of hidden layers in the Transformer encoder.
	        num_attention_heads (`int`, *optional*, defaults to 12):
	            Number of attention heads for each attention layer in the Transformer encoder.
	        num_channels (`int`, *optional*, defaults to 3):
	            Number of channels in the input images.
	        image_size (`int`, *optional*, defaults to 224):
	            The size (resolution) of each image.
	        patch_size (`int`, *optional*, defaults to 32):
	            The size (resolution) of each patch.
	        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
	            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
	            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
	        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
	            The epsilon used by the layer normalization layers.
	        attention_dropout (`float`, *optional*, defaults to 0.0):
	            The dropout ratio for the attention probabilities.
	        initializer_range (`float`, *optional*, defaults to 0.02):
	            The standard deviation for initializing all weight matrices in the model.
	        model_type (`str`, *optional*, defaults to `"Idefics2VisionConfig"`):
	            Accepted by the constructor but not stored by it; the class attribute `model_type` is left as is.
	            Presumably present so a serialized config that carries a `model_type` entry can be passed back in.

	    Example:

	    ```python
	    >>> from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer
	    >>> from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig

	    >>> # Initializing a Idefics2VisionConfig with google/siglip-base-patch16-224 style configuration
	    >>> configuration = Idefics2VisionConfig()

	    >>> # Initializing a Idefics2VisionTransformer (with random weights) from the google/siglip-base-patch16-224 style configuration
	    >>> model = Idefics2VisionTransformer(configuration)

	    >>> # Accessing the model configuration
	    >>> configuration = model.config
	    ```"""

	    _auto_class = 'AutoConfig'
	    model_type = "Idefics2VisionConfig"

	    def __init__(
	        self,
	        hidden_size=768,
	        intermediate_size=3072,
	        num_hidden_layers=12,
	        num_attention_heads=12,
	        num_channels=3,
	        image_size=224,
	        patch_size=32,
	        hidden_act="gelu_pytorch_tanh",
	        layer_norm_eps=1e-6,
	        attention_dropout=0.0,
	        initializer_range=0.02,
	        model_type='Idefics2VisionConfig',
	        **kwargs,
	    ):
	        # Remaining keyword arguments are forwarded to `PretrainedConfig`.
	        super().__init__(**kwargs)

	        self.hidden_size = hidden_size
	        self.intermediate_size = intermediate_size
	        self.num_hidden_layers = num_hidden_layers
	        self.num_attention_heads = num_attention_heads
	        self.num_channels = num_channels
	        self.patch_size = patch_size
	        self.image_size = image_size
	        self.attention_dropout = attention_dropout
	        self.layer_norm_eps = layer_norm_eps
	        self.hidden_act = hidden_act
	        self.initializer_range = initializer_range
	        # `model_type` is intentionally not assigned: the value passed in is ignored.
	# Copied from transformers.models.llama.modeling_llama._get_unpad_data
	def _get_unpad_data(attention_mask):
	seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
	indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
	max_seqlen_in_batch = seqlens_in_batch.max().item()
	cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
	return (
	indices,
	cu_seqlens,
	max_seqlen_in_batch,
	)

	# Adapted from transformers.models.siglip.modeling_siglip.SiglipAttention with Siglip->Idefics2Vision
	class Idefics2VisionAttention(nn.Module):
	    """Multi-headed attention from 'Attention Is All You Need' paper (eager implementation).

	    Args:
	        config: object exposing `hidden_size`, `num_attention_heads` and `attention_dropout`.

	    Raises:
	        ValueError: if `hidden_size` is not divisible by `num_attention_heads`.
	    """

	    def __init__(self, config):
	        super().__init__()
	        self.config = config
	        self.embed_dim = config.hidden_size
	        self.num_heads = config.num_attention_heads
	        self.head_dim = self.embed_dim // self.num_heads
	        if self.head_dim * self.num_heads != self.embed_dim:
	            raise ValueError(
	                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
	                f" {self.num_heads})."
	            )
	        self.scale = self.head_dim**-0.5
	        self.dropout = config.attention_dropout

	        # Creation order is kept (k, v, q, out) so seeded initialisation is unchanged.
	        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
	        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
	        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
	        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

	        # Read by the flash-attention subclass; this attention is never causal.
	        self.is_causal = False

	    def _split_heads(self, projected: torch.Tensor, batch_size: int, seq_len: int) -> torch.Tensor:
	        """Reshape `(batch, seq, embed_dim)` into `(batch, num_heads, seq, head_dim)`."""
	        return projected.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.Tensor] = None,
	        output_attentions: Optional[bool] = False,
	    ) -> Tuple[torch.Tensor, torch.Tensor]:
	        """Input shape: Batch x Time x Channel

	        Args:
	            hidden_states: tensor of shape `(batch_size, q_len, embed_dim)`.
	            attention_mask: optional additive mask of shape `(batch_size, 1, q_len, q_len)`; it is added to
	                the scaled attention scores before the softmax.
	            output_attentions: accepted for interface compatibility; the attention weights are always returned.

	        Returns:
	            `(attn_output, attn_weights)` with shapes `(batch_size, q_len, embed_dim)` and
	            `(batch_size, num_heads, q_len, q_len)`. The returned weights are the post-dropout ones.

	        Raises:
	            ValueError: if the scores, the mask or the attention output do not have the expected shape.
	        """
	        batch_size, q_len, _ = hidden_states.size()

	        query_states = self._split_heads(self.q_proj(hidden_states), batch_size, q_len)
	        key_states = self._split_heads(self.k_proj(hidden_states), batch_size, q_len)
	        value_states = self._split_heads(self.v_proj(hidden_states), batch_size, q_len)

	        k_v_seq_len = key_states.shape[-2]
	        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale

	        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
	            raise ValueError(
	                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
	                f" {attn_weights.size()}"
	            )

	        if attention_mask is not None:
	            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
	                raise ValueError(
	                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
	                )
	            attn_weights = attn_weights + attention_mask

	        # upcast attention to fp32 for the softmax, then go back to the working dtype
	        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
	        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
	        attn_output = torch.matmul(attn_weights, value_states)

	        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
	            raise ValueError(
	                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
	                f" {attn_output.size()}"
	            )

	        # Merge the heads back: (batch, heads, seq, head_dim) -> (batch, seq, embed_dim)
	        attn_output = attn_output.transpose(1, 2).contiguous()
	        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)

	        attn_output = self.out_proj(attn_output)

	        return attn_output, attn_weights


	class Idefics2VisionFlashAttention2(Idefics2VisionAttention):
	    """
	    Idefics2Vision flash attention module. This module inherits from `Idefics2VisionAttention` as the weights of the module stays
	    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
	    flash attention and deal with padding tokens in case the input contains any of them.
	    """

	    # Adapted from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
	    def __init__(self, *args, **kwargs):
	        # Positional and keyword arguments are forwarded unchanged to `Idefics2VisionAttention`.
	        super().__init__(*args, **kwargs)

	        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
	        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
	        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
	        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()

	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: Optional[torch.LongTensor] = None,
	        position_ids: Optional[torch.LongTensor] = None,
	        past_key_value: Optional[Cache] = None,
	        output_attentions: bool = False,
	        use_cache: bool = False,
	        **kwargs,
	    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
	        """Run self-attention through the flash-attn kernels.

	        Args:
	            hidden_states: tensor of shape `(batch_size, q_len, embed_dim)`.
	            attention_mask: optional padding mask forwarded to `_flash_attention_forward`
	                (non-zero entries mark positions to keep); `None` attends to the full sequence.
	            position_ids, past_key_value, use_cache, **kwargs: accepted for signature compatibility
	                and ignored.
	            output_attentions: ignored; no attention weights are produced on this path.

	        Returns:
	            `(attn_output, None)` where `attn_output` has shape `(batch_size, q_len, embed_dim)`.
	        """
	        bsz, q_len, _ = hidden_states.size()

	        # Flash attention expects the layout [batch_size, sequence_length, num_heads, head_dim],
	        # which is exactly what a plain `view` of the projections gives.
	        head_shape = (bsz, q_len, self.num_heads, self.head_dim)
	        query_states = self.q_proj(hidden_states).view(head_shape)
	        key_states = self.k_proj(hidden_states).view(head_shape)
	        value_states = self.v_proj(hidden_states).view(head_shape)

	        dropout_rate = self.dropout if self.training else 0.0

	        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
	        # therefore the input hidden states gets silently casted in float32. Hence, we need
	        # cast them back in the correct dtype just to be sure everything works as expected.
	        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
	        # in fp32.
	        input_dtype = query_states.dtype
	        if input_dtype == torch.float32:
	            if torch.is_autocast_enabled():
	                target_dtype = torch.get_autocast_gpu_dtype()
	            # Handle the case where the model is quantized
	            elif hasattr(self.config, "_pre_quantization_dtype"):
	                target_dtype = self.config._pre_quantization_dtype
	            else:
	                target_dtype = self.q_proj.weight.dtype

	            # The logger is fetched here so this class does not depend on a module-level `logger`.
	            logging.get_logger(__name__).warning_once(
	                f"The input hidden states seems to be silently casted in float32, this might be related to"
	                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
	                f" {target_dtype}."
	            )

	            query_states = query_states.to(target_dtype)
	            key_states = key_states.to(target_dtype)
	            value_states = value_states.to(target_dtype)

	        attn_output = self._flash_attention_forward(
	            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
	        )

	        attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
	        attn_output = self.out_proj(attn_output)

	        return attn_output, None

	    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
	    def _flash_attention_forward(
	        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
	    ):
	        """
	        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
	        first unpad the input, then computes the attention scores and pad the final attention scores.

	        Args:
	            query_states (`torch.Tensor`):
	                Input query states to be passed to Flash Attention API
	            key_states (`torch.Tensor`):
	                Input key states to be passed to Flash Attention API
	            value_states (`torch.Tensor`):
	                Input value states to be passed to Flash Attention API
	            attention_mask (`torch.Tensor`):
	                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
	                position of padding tokens and 1 for the position of non-padding tokens.
	            query_length (`int`):
	                Length of the query sequence; used to unpad the queries and to re-pad the output.
	            dropout (`float`):
	                Attention dropout
	            softmax_scale (`float`, *optional*):
	                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
	        """
	        if not self._flash_attn_uses_top_left_mask:
	            causal = self.is_causal
	        else:
	            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
	            causal = self.is_causal and query_length != 1

	        # No mask: every position is attended to, use the dense kernel directly.
	        if attention_mask is None:
	            return flash_attn_func(
	                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
	            )

	        # A mask was given: drop the masked positions, run the variable-length kernel, then re-pad.
	        batch_size = query_states.shape[0]
	        query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
	            query_states, key_states, value_states, attention_mask, query_length
	        )

	        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
	        max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens

	        attn_output_unpad = flash_attn_varlen_func(
	            query_states,
	            key_states,
	            value_states,
	            cu_seqlens_q=cu_seqlens_q,
	            cu_seqlens_k=cu_seqlens_k,
	            max_seqlen_q=max_seqlen_in_batch_q,
	            max_seqlen_k=max_seqlen_in_batch_k,
	            dropout_p=dropout,
	            softmax_scale=softmax_scale,
	            causal=causal,
	        )

	        return pad_input(attn_output_unpad, indices_q, batch_size, query_length)

	    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
	    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
	        """Remove the masked positions from the query/key/value tensors.

	        Returns:
	            `(query_layer, key_layer, value_layer, indices_q, (cu_seqlens_q, cu_seqlens_k),
	            (max_seqlen_in_batch_q, max_seqlen_in_batch_k))`, the arguments needed to call
	            `flash_attn_varlen_func` and to re-pad its output.
	        """
	        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
	        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape

	        key_layer = index_first_axis(
	            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
	        )
	        value_layer = index_first_axis(
	            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
	        )
	        if query_length == kv_seq_len:
	            # Queries and keys cover the same positions: reuse the key metadata.
	            query_layer = index_first_axis(
	                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
	            )
	            cu_seqlens_q = cu_seqlens_k
	            max_seqlen_in_batch_q = max_seqlen_in_batch_k
	            indices_q = indices_k
	        elif query_length == 1:
	            max_seqlen_in_batch_q = 1
	            cu_seqlens_q = torch.arange(
	                batch_size + 1, dtype=torch.int32, device=query_layer.device
	            )  # There is a memcpy here, that is very bad.
	            indices_q = cu_seqlens_q[:-1]
	            query_layer = query_layer.squeeze(1)
	        else:
	            # The -q_len: slice assumes left padding.
	            attention_mask = attention_mask[:, -query_length:]
	            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)

	        return (
	            query_layer,
	            key_layer,
	            value_layer,
	            indices_q,
	            (cu_seqlens_q, cu_seqlens_k),
	            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
	        )

	# Lookup table from an attention-implementation name to the attention class to build.
	# `Idefics2EncoderLayer` indexes it with `config._attn_implementation`.
	IDEFICS_VISION_ATTENTION_CLASSES = {
	"eager": Idefics2VisionAttention,
	"flash_attention_2": Idefics2VisionFlashAttention2,
	}

	# Adapted from transformers.models.siglip.modeling_siglip.SiglipMLP with Siglip->Idefics2Vision
	class Idefics2VisionMLP(nn.Module):
	    """Feed-forward block: linear -> activation -> linear.

	    The first layer maps `config.hidden_size` to `config.intermediate_size`, the second maps back.
	    The activation is looked up in `ACT2FN` by `config.hidden_act`.
	    """

	    def __init__(self, config):
	        super().__init__()
	        self.config = config
	        self.activation_fn = ACT2FN[config.hidden_act]
	        model_dim, inner_dim = config.hidden_size, config.intermediate_size
	        self.fc1 = nn.Linear(model_dim, inner_dim)
	        self.fc2 = nn.Linear(inner_dim, model_dim)

	    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
	        """Apply `fc1`, the activation and `fc2` in sequence and return the result."""
	        expanded = self.fc1(hidden_states)
	        activated = self.activation_fn(expanded)
	        return self.fc2(activated)

	class Idefics2EncoderLayer(nn.Module):
	    """One pre-norm Transformer block: self-attention and an MLP, each wrapped in a residual connection.

	    The attention class is selected from `IDEFICS_VISION_ATTENTION_CLASSES` using
	    `config._attn_implementation`.
	    """

	    def __init__(self, config: Idefics2VisionConfig):
	        super().__init__()
	        self.embed_dim = config.hidden_size
	        attention_cls = IDEFICS_VISION_ATTENTION_CLASSES[config._attn_implementation]
	        self.self_attn = attention_cls(config)
	        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
	        self.mlp = Idefics2VisionMLP(config)
	        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)

	    # Adapted from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward
	    def forward(
	        self,
	        hidden_states: torch.Tensor,
	        attention_mask: torch.Tensor,
	        output_attentions: Optional[bool] = False,
	    ) -> Tuple[torch.FloatTensor]:
	        """
	        Args:
	            hidden_states (`torch.FloatTensor`):
	                Input to the layer of shape `(batch, seq_len, embed_dim)`.
	            attention_mask (`torch.FloatTensor`):
	                Mask handed unchanged to the attention module. For the eager attention this is of shape
	                `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
	            output_attentions (`bool`, *optional*, defaults to `False`):
	                When truthy, the attention weights returned by the attention module are appended to the output.

	        Returns:
	            `(hidden_states,)`, or `(hidden_states, attn_weights)` when `output_attentions` is truthy.
	        """
	        # Attention sub-block: normalise, attend, add the skip connection.
	        attn_out, attn_weights = self.self_attn(
	            hidden_states=self.layer_norm1(hidden_states),
	            attention_mask=attention_mask,
	            output_attentions=output_attentions,
	        )
	        hidden_states = hidden_states + attn_out

	        # Feed-forward sub-block: normalise, transform, add the skip connection.
	        mlp_out = self.mlp(self.layer_norm2(hidden_states))
	        hidden_states = hidden_states + mlp_out

	        if output_attentions:
	            return (hidden_states, attn_weights)
	        return (hidden_states,)

	# Adapted from transformers.models.siglip.modeling_siglip.SiglipEncoder with Siglip->Idefics2
	class Idefics2Encoder(nn.Module):
	    """
	    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
	    [`Idefics2EncoderLayer`].

	    Args:
	        config: Idefics2VisionConfig
	    """

	    def __init__(self, config: Idefics2VisionConfig):
	        super().__init__()
	        self.config = config
	        self.layers = nn.ModuleList([Idefics2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
	        self.gradient_checkpointing = False

	    def forward(
	        self,
	        inputs_embeds,
	        attention_mask: Optional[torch.Tensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple, BaseModelOutput]:
	        r"""
	        Run `inputs_embeds` through every encoder layer in order.

	        Args:
	            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
	                Embedded input sequence fed to the first layer.
	            attention_mask (`torch.Tensor`, *optional*):
	                Mask passed unchanged to every layer.
	            output_attentions (`bool`, *optional*):
	                Whether to collect the second output of every layer. Falls back to
	                `config.output_attentions` when `None`.
	            output_hidden_states (`bool`, *optional*):
	                Whether to collect the input of every layer plus the final output. Falls back to
	                `config.output_hidden_states` when `None`.
	            return_dict (`bool`, *optional*):
	                Whether to return a [`BaseModelOutput`] instead of a plain tuple. Falls back to
	                `config.use_return_dict` when `None`.

	        Returns:
	            A [`BaseModelOutput`], or a tuple of the non-`None` values among
	            `(last_hidden_state, hidden_states, attentions)`.
	        """
	        if output_attentions is None:
	            output_attentions = self.config.output_attentions
	        if output_hidden_states is None:
	            output_hidden_states = self.config.output_hidden_states
	        if return_dict is None:
	            return_dict = self.config.use_return_dict

	        # Collected in lists and frozen into tuples at the end; `None` means "not requested".
	        state_trace = [] if output_hidden_states else None
	        attention_trace = [] if output_attentions else None

	        hidden_states = inputs_embeds
	        for encoder_layer in self.layers:
	            if output_hidden_states:
	                state_trace.append(hidden_states)

	            if self.gradient_checkpointing and self.training:
	                layer_outputs = self._gradient_checkpointing_func(
	                    encoder_layer.__call__,
	                    hidden_states,
	                    attention_mask,
	                    output_attentions,
	                )
	            else:
	                layer_outputs = encoder_layer(
	                    hidden_states,
	                    attention_mask,
	                    output_attentions=output_attentions,
	                )

	            hidden_states = layer_outputs[0]

	            if output_attentions:
	                attention_trace.append(layer_outputs[1])

	        if output_hidden_states:
	            state_trace.append(hidden_states)

	        encoder_states = None if state_trace is None else tuple(state_trace)
	        all_attentions = None if attention_trace is None else tuple(attention_trace)

	        if return_dict:
	            return BaseModelOutput(
	                last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
	            )
	        return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)

	class Idefics2VisionEmbeddings(nn.Module):
	    """
	    This is a modified version of `siglip.modeling_siglip.SiglipVisionEmbeddings` to enable images of variable
	    resolution.

	    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
	    which allows treating images in their native aspect ratio and without the need to resize them to the same
	    fixed size. In particular, we start from the original pre-trained SigLIP model
	    (which uses images of fixed-size square images) and adapt it by training on images of variable resolutions.
	    """

	    def __init__(self, config: Idefics2VisionConfig):
	        super().__init__()
	        self.embed_dim = config.hidden_size
	        self.image_size = config.image_size
	        self.patch_size = config.patch_size

	        # Non-overlapping patches: kernel size and stride are both the patch size.
	        self.patch_embedding = nn.Conv2d(
	            in_channels=config.num_channels,
	            out_channels=self.embed_dim,
	            kernel_size=self.patch_size,
	            stride=self.patch_size,
	            padding="valid",
	        )

	        # One learned position per cell of the (image_size // patch_size)^2 grid.
	        self.num_patches_per_side = self.image_size // self.patch_size
	        self.num_patches = self.num_patches_per_side**2
	        self.num_positions = self.num_patches
	        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)

	    def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
	        """Embed image patches and add position embeddings picked per image from its valid-patch grid.

	        Args:
	            pixel_values: 4-D tensor `(batch_size, channels, height, width)`.
	            patch_attention_mask: boolean tensor with one 2-D patch grid per image; `True` marks a valid patch.
	                The valid extent is counted along the first column and the first row, so the valid patches
	                are presumably expected to form a top-left rectangle.

	        Returns:
	            Patch embeddings plus position embeddings, one row per patch of the padded grid.
	            Patches masked out keep position id 0.
	        """
	        batch_size, _, max_im_h, max_im_w = pixel_values.shape

	        embeddings = self.patch_embedding(pixel_values).flatten(2).transpose(1, 2)

	        grid_h = max_im_h // self.patch_size
	        grid_w = max_im_w // self.patch_size
	        cell = 1 / self.num_patches_per_side
	        boundaries = torch.arange(cell, 1.0, cell)
	        position_ids = torch.full(size=(batch_size, grid_h * grid_w), fill_value=0)

	        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
	            valid_rows = p_attn_mask[:, 0].sum()
	            valid_cols = p_attn_mask[0].sum()

	            # Spread the valid patches over [0, 1) and snap each coordinate to a cell of the learned grid.
	            rel_h = torch.arange(0, 1 - 1e-6, 1 / valid_rows)
	            rel_w = torch.arange(0, 1 - 1e-6, 1 / valid_cols)
	            bucket_h = torch.bucketize(rel_h, boundaries, right=True)
	            bucket_w = torch.bucketize(rel_w, boundaries, right=True)

	            pos_ids = (bucket_h[:, None] * self.num_patches_per_side + bucket_w).flatten()
	            flat_mask = p_attn_mask.view(-1).cpu()
	            # `image_row` is a view into `position_ids`, so this writes in place.
	            image_row = position_ids[batch_idx]
	            image_row[flat_mask] = pos_ids

	        position_ids = position_ids.to(self.position_embedding.weight.device)
	        return embeddings + self.position_embedding(position_ids)


	class Idefics2VisionTransformer(PreTrainedModel):
	    """Vision encoder: patch embeddings, a stack of Transformer layers and a final layer norm.

	    Flash attention is used whenever the flash-attn package is available; otherwise the eager attention
	    implementation is selected so the model can still be built and run.
	    """

	    _auto_class = 'AutoModel'
	    config_class = Idefics2VisionConfig
	    supports_gradient_checkpointing = True

	    def __init__(self, config: Idefics2VisionConfig):
	        super().__init__(config)
	        embed_dim = config.hidden_size

	        # Prefer flash attention, but do not select it when its kernels cannot be imported:
	        # the flash attention class would fail on the undefined `flash_attn_func`.
	        self._use_flash_attention_2 = is_flash_attn_2_available()
	        config._attn_implementation = "flash_attention_2" if self._use_flash_attention_2 else "eager"
	        self.config = config
	        self.embeddings = Idefics2VisionEmbeddings(config)
	        self.encoder = Idefics2Encoder(config)
	        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

	    def get_input_embeddings(self):
	        """Return the embedding module."""
	        return self.embeddings

	    def set_input_embeddings(self, value):
	        """Replace the embedding module with `value`."""
	        self.embeddings = value

	    def forward(
	        self,
	        pixel_values,
	        patch_attention_mask: Optional[torch.BoolTensor] = None,
	        output_attentions: Optional[bool] = None,
	        output_hidden_states: Optional[bool] = None,
	        return_dict: Optional[bool] = None,
	    ) -> Union[Tuple, BaseModelOutput]:
	        """Encode a batch of images.

	        Args:
	            pixel_values: 4-D tensor `(batch_size, channels, height, width)`. It is cast to the dtype of the
	                patch-embedding weights.
	            patch_attention_mask: optional `(batch_size, height // patch_size, width // patch_size)` mask,
	                truthy for valid patches. Defaults to all patches valid.
	            output_attentions, output_hidden_states, return_dict: fall back to the corresponding config
	                values when `None` and are forwarded to the encoder.

	        Returns:
	            A [`BaseModelOutput`] whose `last_hidden_state` went through the final layer norm, or the
	            equivalent tuple when `return_dict` is false.
	        """
	        # Match the parameter dtype instead of assuming bfloat16, so fp16/fp32 weights work too.
	        pixel_values = pixel_values.to(self.embeddings.patch_embedding.weight.dtype)

	        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
	        output_hidden_states = (
	            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
	        )
	        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

	        batch_size = pixel_values.size(0)
	        if patch_attention_mask is None:
	            patch_size = self.config.patch_size
	            patch_attention_mask = torch.ones(
	                (
	                    batch_size,
	                    pixel_values.size(2) // patch_size,
	                    pixel_values.size(3) // patch_size,
	                ),
	                dtype=torch.bool,
	                device=pixel_values.device,
	            )
	        else:
	            # Normalise caller-provided masks so the `~` below is a logical NOT, not a bitwise one.
	            patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device)

	        hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)

	        patch_attention_mask = patch_attention_mask.view(batch_size, -1)
	        # The call to `_upad_input` in `_flash_attention_forward` is expensive
	        # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
	        # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
	        if not torch.any(~patch_attention_mask):
	            patch_attention_mask = None
	        elif not self._use_flash_attention_2:
	            # The eager attention expects an additive 4-D mask.
	            patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)

	        encoder_outputs = self.encoder(
	            inputs_embeds=hidden_states,
	            attention_mask=patch_attention_mask,
	            output_attentions=output_attentions,
	            output_hidden_states=output_hidden_states,
	            return_dict=return_dict,
	        )

	        last_hidden_state = encoder_outputs[0]
	        last_hidden_state = self.post_layernorm(last_hidden_state)

	        if not return_dict:
	            return (last_hidden_state,) + encoder_outputs[1:]

	        return BaseModelOutput(
	            last_hidden_state=last_hidden_state,
	            hidden_states=encoder_outputs.hidden_states,
	            attentions=encoder_outputs.attentions,
	        )