AlexHung29629 committed on
Commit
3889df6
1 Parent(s): 368067b

Upload 5 files

audio_processing_mllama.py ADDED
@@ -0,0 +1,61 @@
+ import math
+ from typing import Dict, List, Optional, Union
+ import numpy as np
+ import transformers
+ from transformers.tokenization_utils_base import AudioInput
+ from transformers.models.seamless_m4t.feature_extraction_seamless_m4t import SeamlessM4TFeatureExtractor
+ from transformers.utils import TensorType
+ from transformers.feature_extraction_utils import BatchFeature
+ from transformers import AutoFeatureExtractor
+
+
+ def build_audio_tokens(text: List[str], audio_features: Union[Dict, List[List[np.ndarray]]], audio_token="<|audio|>") -> List[str]:
+     if not isinstance(audio_features, list):
+         audio_features = audio_features['audio_features']
+     bs = len(audio_features)  # works for both the packed array and the nested-list form
+     for i in range(bs):
+         for j in range(len(audio_features[i])):
+             tgt_token = f"<|audio_{j+1}|>" * get_num_embeddings(audio_features[i][j].shape[0])
+             text[i] = text[i].replace(audio_token, tgt_token, 1)
+     return text
+
+ def get_num_embeddings(num_frames, adapter_kernel_size=7, adapter_stride=4) -> int:
+     return math.ceil((num_frames - adapter_kernel_size) / adapter_stride) + 1 + 2  # 2 = <|begin_of_audio|>, <|end_of_audio|>
+
+
+ class MllamaAudioFeatureExtractor(SeamlessM4TFeatureExtractor):
+
+     def __call__(
+         self,
+         batch_audio_clips: List[List[AudioInput]],
+         return_tensors: Optional[Union[str, TensorType]] = None,
+     ) -> BatchFeature:
+         audio_features = [
+             [
+                 super(MllamaAudioFeatureExtractor, self).__call__(
+                     audio_j, sampling_rate=16000, return_attention_mask=False
+                 )['input_features'][0]
+                 for audio_j in audio_i
+             ]
+             for audio_i in batch_audio_clips
+         ]
+         packed_audio_features = self.pack_audio_clips(audio_features)
+
+         encoded_audio_inputs = BatchFeature(
+             data={
+                 "audio_features": packed_audio_features,
+             },
+             tensor_type=return_tensors,
+         )
+
+         return encoded_audio_inputs
+
+     def pack_audio_clips(self, batch_audio_clips: List[List[np.ndarray]]) -> np.ndarray:
+         assert batch_audio_clips[0][0].ndim == 2  # sequence length x feature dimension
+         # Determine output shape: (batch_size, max_num_clips, max_frames, feature_dim)
+         batch_size = len(batch_audio_clips)
+         max_num_clips = max([len(clips) for clips in batch_audio_clips])
+         max_frames = max([clip.shape[0] for clips in batch_audio_clips for clip in clips])
+         feature_dim = batch_audio_clips[0][0].shape[1]
+
+         stacked_audio_clips = np.zeros((batch_size, max_num_clips, max_frames, feature_dim), dtype=np.float32)
+         for i, clips in enumerate(batch_audio_clips):
+             for j, clip in enumerate(clips):
+                 stacked_audio_clips[i, j, :clip.shape[0], :] = clip
+
+         return stacked_audio_clips
+
+ AutoFeatureExtractor.register("MllamaAudioFeatureExtractor", MllamaAudioFeatureExtractor)
+ transformers.MllamaAudioFeatureExtractor = MllamaAudioFeatureExtractor
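
A quick way to sanity-check the extractor above is to run it on synthetic clips. The snippet below is illustrative only: it assumes 16 kHz mono waveforms as float32 NumPy arrays and the default SeamlessM4T filterbank settings; no checkpoint is involved.

```python
import numpy as np

# Two samples: the first with one clip, the second with two clips of different lengths.
extractor = MllamaAudioFeatureExtractor()
clips = [
    [np.random.randn(16000).astype(np.float32)],
    [np.random.randn(32000).astype(np.float32), np.random.randn(8000).astype(np.float32)],
]
features = extractor(clips, return_tensors="np")
# Packed to (batch_size, max_num_clips, max_frames, feature_dim); shorter clips are zero-padded.
print(features["audio_features"].shape)
# Number of placeholder tokens the processor would emit for a clip with that many (padded) frames.
print(get_num_embeddings(features["audio_features"].shape[2]))
```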
configuration_llama3.py ADDED
@@ -0,0 +1,112 @@
+ # coding=utf-8
+ # Copyright 2024 HuggingFace Inc. team. All rights reserved.
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """Mllama model configuration"""
+
+ import os
+ from typing import Dict, List, Optional, Union
+
+ import transformers
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.modeling_rope_utils import rope_config_validation
+ from transformers.utils import logging
+ from transformers import Wav2Vec2BertConfig, AutoConfig
+ from transformers.models.mllama.configuration_mllama import MllamaVisionConfig, MllamaTextConfig
+
+ logger = logging.get_logger(__name__)
+
+
+ class Llama3Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`Llama3ForConditionalGeneration`]. It is used to instantiate an
+     Mllama model according to the specified arguments, defining the model architecture. Instantiating a configuration
+     with the defaults will yield a similar configuration to that of the Mllama-9B.
+
+     e.g. [meta-llama/Llama-3.2-11B-Vision](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision)
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+     Args:
+         vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `MllamaVisionConfig`):
+             The config object or dictionary of the vision backbone.
+         text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `MllamaTextConfig`):
+             The config object or dictionary of the text backbone.
+         audio_config (`Union[AutoConfig, dict]`, *optional*, defaults to `Wav2Vec2BertConfig`):
+             The config object or dictionary of the audio backbone.
+         image_token_index (`int`, *optional*, defaults to 128256):
+             The image token index to encode the image prompt.
+         audio_token_index (`int`, *optional*, defaults to 128257):
+             The audio token index to encode the audio prompt.
+
+     Example:
+
+     ```python
+     >>> from transformers import MllamaForConditionalGeneration, MllamaConfig, MllamaVisionConfig, MllamaTextConfig
+
+     >>> # Initializing a CLIP-vision config
+     >>> vision_config = MllamaVisionConfig()
+
+     >>> # Initializing a Llama config
+     >>> text_config = MllamaTextConfig()
+
+     >>> # Initializing a mllama-11b style configuration
+     >>> configuration = MllamaConfig(vision_config, text_config)
+
+     >>> # Initializing a model from the mllama-11b style configuration
+     >>> model = MllamaForConditionalGeneration(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "llama3"
+     is_composition = True
+
+     def __init__(
+         self,
+         vision_config=None,
+         text_config=None,
+         audio_config=None,
+         image_token_index=128256,
+         audio_token_index=128257,
+         **kwargs,
+     ):
+         if vision_config is None:
+             self.vision_config = MllamaVisionConfig()
+             logger.info("vision_config is None, using default mllama vision config")
+         elif isinstance(vision_config, dict):
+             self.vision_config = MllamaVisionConfig(**vision_config)
+         elif isinstance(vision_config, MllamaVisionConfig):
+             self.vision_config = vision_config
+
+         self.image_token_index = image_token_index
+
+         if audio_config is None:
+             self.audio_config = Wav2Vec2BertConfig()
+             logger.info("audio_config is None, using default mllama audio config")
+         elif isinstance(audio_config, dict):
+             self.audio_config = Wav2Vec2BertConfig(**audio_config)
+         elif isinstance(audio_config, Wav2Vec2BertConfig):
+             self.audio_config = audio_config
+
+         self.audio_token_index = audio_token_index
+
+         if text_config is None:
+             self.text_config = MllamaTextConfig()
+             logger.info("text_config is None, using default mllama text config")
+         elif isinstance(text_config, dict):
+             self.text_config = MllamaTextConfig(**text_config)
+         elif isinstance(text_config, MllamaTextConfig):
+             self.text_config = text_config
+
+         super().__init__(**kwargs)
+
+ AutoConfig.register("llama3", Llama3Config)
+ transformers.Llama3Config = Llama3Config
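
For reference, a minimal sketch of composing the config from sub-configs; the sizes below are placeholders, not the values of any released checkpoint.

```python
from transformers import Wav2Vec2BertConfig

# The audio tower must use the adapter and project to the text hidden size
# (Llama3Embedding asserts both, see mllama_audio_model.py).
audio_cfg = Wav2Vec2BertConfig(add_adapter=True, output_hidden_size=4096)
cfg = Llama3Config(
    vision_config=None,                 # falls back to the default MllamaVisionConfig
    text_config={"hidden_size": 4096},  # dicts are promoted to MllamaTextConfig
    audio_config=audio_cfg,
    audio_token_index=128257,
)
print(cfg.model_type, cfg.audio_config.output_hidden_size)  # llama3 4096
```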
mllama_audio_model.py ADDED
@@ -0,0 +1,74 @@
+ import math
+ from typing import Optional, Tuple, Union
+ import torch
+ from torch import nn
+ from transformers.modeling_outputs import BaseModelOutput
+ from transformers import Wav2Vec2BertModel, Wav2Vec2BertConfig, Wav2Vec2BertPreTrainedModel
+ from transformers.models.wav2vec2_bert.modeling_wav2vec2_bert import (
+     Wav2Vec2BertFeatureProjection,
+     Wav2Vec2BertSelfAttention,
+ )
+ from transformers.models.mllama.configuration_mllama import MllamaTextConfig
+
+
+ class Llama3Embedding(Wav2Vec2BertPreTrainedModel):
+     base_model_prefix = "audio_model"
+
+     def __init__(self, config: Wav2Vec2BertConfig, text_config: MllamaTextConfig):
+         super().__init__(config)
+         assert config.add_adapter is True, f'{type(self).__name__} requires add_adapter to be true.'
+         assert config.output_hidden_size == text_config.hidden_size
+         self.text_embeddings = nn.Embedding(text_config.vocab_size, text_config.hidden_size, text_config.pad_token_id)
+         self.audio_embedding = Wav2Vec2BertModel(config)
+         #assert self.text_embeddings.weight.size(-1) == text_config.hidden_size, f"{self.text_embeddings.weight}, {text_config.hidden_size=}, {text_config.vocab_size=}"
+         self.start_of_audio = nn.Parameter(data=torch.zeros((1, config.output_hidden_size)), requires_grad=True)
+         self.end_of_audio = nn.Parameter(data=torch.zeros((1, config.output_hidden_size)), requires_grad=True)
+         self.text_config = text_config
+
+     def _init_weights(self, module):
+         """Initialize the weights"""
+         std = self.text_config.initializer_range
+         if isinstance(module, Wav2Vec2BertSelfAttention):
+             if hasattr(module, "pos_bias_u"):
+                 nn.init.xavier_uniform_(module.pos_bias_u)
+             if hasattr(module, "pos_bias_v"):
+                 nn.init.xavier_uniform_(module.pos_bias_v)
+         elif isinstance(module, Wav2Vec2BertFeatureProjection):
+             k = math.sqrt(1 / module.projection.in_features)
+             nn.init.uniform_(module.projection.weight, a=-k, b=k)
+             nn.init.uniform_(module.projection.bias, a=-k, b=k)
+         elif isinstance(module, nn.Linear):
+             module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+             if module.bias is not None:
+                 module.bias.data.zero_()
+         elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+             module.bias.data.zero_()
+             module.weight.data.fill_(1.0)
+         elif isinstance(module, nn.Conv1d):
+             nn.init.kaiming_normal_(module.weight)
+             if module.bias is not None:
+                 k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                 nn.init.uniform_(module.bias, a=-k, b=k)
+         elif isinstance(module, nn.Embedding):
+             module.weight.data.normal_(mean=0.0, std=std)
+             if module.padding_idx is not None:
+                 module.weight.data[module.padding_idx].zero_()
+         elif isinstance(module, nn.Parameter):
+             module.data.normal_(mean=0.0, std=std)
+
+     def forward(
+         self,
+         input_ids: torch.LongTensor = None,
+         audio_features: Optional[torch.Tensor] = None,
+     ) -> Union[BaseModelOutput, Tuple[torch.Tensor, ...]]:
+         # Negative ids mark audio placeholder positions; clamp them to 0 for the text lookup,
+         # those rows are overwritten with audio embeddings below.
+         input_embeddings = self.text_embeddings(torch.clamp(input_ids, min=0))
+         if audio_features is None:
+             return input_embeddings
+         bs, max_num_img, l, d = audio_features.shape
+         audio_embeddings = self.audio_embedding(input_features=audio_features.view((bs * max_num_img, l, d)))['last_hidden_state']
+         audio_embeddings = audio_embeddings.view((bs, max_num_img, -1, self.start_of_audio.shape[-1]))
+
+         for i in range(bs):
+             for j in range(max_num_img):
+                 # The j-th audio clip of sample i is encoded as token id -(j + 1).
+                 audio_id = -1 - j
+                 if torch.any(input_ids[i] == audio_id):
+                     positions = torch.nonzero(input_ids[i] == audio_id, as_tuple=True)
+                     seq_len = input_embeddings[i][positions].shape[0] - 2
+                     input_embeddings[i] = input_embeddings[i].index_put(
+                         positions,
+                         torch.concat([self.start_of_audio, audio_embeddings[i, j, :seq_len, :], self.end_of_audio]),
+                         accumulate=False,
+                     )
+         return input_embeddings
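
The convention here is that the processor rewrites the j-th audio clip of a sample into a run of placeholder tokens whose ids are remapped to -(j + 1); `forward` then overwrites those rows with the start-of-audio vector, the adapter outputs, and the end-of-audio vector. Below is a toy-sized sketch of that flow; every size is hypothetical and chosen only so the module runs quickly, not taken from the real checkpoint.

```python
import torch
from transformers import Wav2Vec2BertConfig
from transformers.models.mllama.configuration_mllama import MllamaTextConfig

text_cfg = MllamaTextConfig(vocab_size=1000, hidden_size=64, pad_token_id=0)
audio_cfg = Wav2Vec2BertConfig(
    hidden_size=64, output_hidden_size=64, add_adapter=True,
    num_hidden_layers=1, num_adapter_layers=1, num_attention_heads=2,
)
embed = Llama3Embedding(audio_cfg, text_cfg).eval()

# Three placeholder positions (id -1) reserve room for start + one audio frame + end.
input_ids = torch.tensor([[1, -1, -1, -1, 2]])
audio_features = torch.randn(1, 1, 20, audio_cfg.feature_projection_input_dim)
with torch.no_grad():
    out = embed(input_ids=input_ids, audio_features=audio_features)
print(out.shape)  # torch.Size([1, 5, 64])
```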
modeling_llama3.py ADDED
@@ -0,0 +1,674 @@
+ # coding=utf-8
+ from typing import List, Optional, Tuple, Union
+
+ import torch
+ import torch.utils.checkpoint
+ from torch import nn
+
+ import transformers
+ from transformers import MllamaPreTrainedModel, MllamaVisionModel, MllamaForCausalLM, Wav2Vec2BertConfig, AutoModel, AutoModelForCausalLM
+ from transformers.cache_utils import Cache, StaticCache
+ from transformers.generation import GenerationMixin
+ from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+ from transformers.modeling_outputs import CausalLMOutputWithPast, BaseModelOutputWithPast
+ from transformers.utils import logging
+ from transformers.models.mllama.modeling_mllama import _prepare_cross_attention_mask, MllamaCrossAttentionDecoderLayer, MllamaSelfAttentionDecoderLayer, MllamaTextRMSNorm, MllamaRotaryEmbedding
+ from transformers.models.mllama.configuration_mllama import MllamaTextConfig
+ from .configuration_llama3 import Llama3Config
+ from .mllama_audio_model import Llama3Embedding
+
+
+ logger = logging.get_logger(__name__)
+
+ class Llama3PreTrainedModel(MllamaPreTrainedModel):
+     config_class = Llama3Config
+     base_model_prefix = "model"
+
+ class Llama3TextModel(MllamaPreTrainedModel):
+     config_class = MllamaTextConfig
+     base_model_prefix = "language_model.model"
+
+     def __init__(self, config: MllamaTextConfig):
+         super().__init__(config)
+         self.padding_idx = config.pad_token_id
+         self.vocab_size = config.vocab_size
+         #self.embed_tokens = Llama3Embedding(audio_config, config)
+         self.cross_attention_layers = config.cross_attention_layers
+
+         layers = []
+         for layer_idx in range(config.num_hidden_layers):
+             if layer_idx in self.cross_attention_layers:
+                 layers.append(MllamaCrossAttentionDecoderLayer(config, layer_idx))
+             else:
+                 layers.append(MllamaSelfAttentionDecoderLayer(config, layer_idx))
+
+         self.layers = nn.ModuleList(layers)
+         self.norm = MllamaTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+         self.rotary_emb = MllamaRotaryEmbedding(config=config)
+         self.gradient_checkpointing = False
+         self.post_init()
+
+     def get_input_embeddings(self):
+         #return self.embed_tokens.text_embeddings
+         return None
+
+     def set_input_embeddings(self, value):
+         #self.embed_tokens.text_embeddings = value
+         pass
+
+     def forward(
+         self,
+         #input_ids: Optional[torch.LongTensor] = None,
+         #audio_features: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         cross_attention_states: Optional[torch.FloatTensor] = None,
+         cross_attention_mask: Optional[torch.Tensor] = None,
+         full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+     ) -> Union[Tuple, BaseModelOutputWithPast]:
+         """
+
+         Returns:
+
+         Example:
+
+         ```python
+         >>> from transformers import AutoProcessor, MllamaTextModel
+
+         >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
+         >>> model = MllamaTextModel.from_pretrained(checkpoint)
+         >>> processor = AutoProcessor.from_pretrained(checkpoint)
+
+         >>> text = "<|image|>If I had to write a haiku for this one"
+         >>> inputs = processor(text=text, return_tensors="pt")
+
+         >>> output = model(**inputs)
+
+         >>> print(output.last_hidden_state.shape)
+         torch.Size([1, 13, 4096])
+         ```
+         """
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         use_cache = use_cache if use_cache is not None else self.config.use_cache
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         #if (input_ids is None) ^ (inputs_embeds is not None):
+         #    raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+         if self.gradient_checkpointing and self.training and use_cache:
+             logger.warning_once(
+                 "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+             )
+             use_cache = False
+
+         #if inputs_embeds is None:
+         #    inputs_embeds = self.embed_tokens(input_ids=input_ids, audio_features=audio_features)
+
+         hidden_states = inputs_embeds
+
+         if cache_position is None:
+             past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+             cache_position = torch.arange(
+                 past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+             )
+         if position_ids is None:
+             position_ids = cache_position.unsqueeze(0)
+
+         causal_mask = self._update_causal_mask(
+             attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+         )
+
+         # create position embeddings to be shared across the decoder layers
+         position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+         # decoder layers
+         all_hidden_states = () if output_hidden_states else None
+         all_self_attns = () if output_attentions else None
+         next_decoder_cache = None
+
+         for idx, decoder_layer in enumerate(self.layers):
+             if output_hidden_states:
+                 all_hidden_states += (hidden_states,)
+
+             # For text-only path we should skip cross attention layers.
+             # Let's check if the layer is cross attention layer and if we have cross attention states
+             # or cached cross attention states.
+             is_cross_attention_layer = idx in self.cross_attention_layers
+             is_cross_attention_cache_empty = past_key_values is None or (
+                 past_key_values is not None and past_key_values.get_seq_length(idx) == 0
+             )
+
+             if is_cross_attention_layer and cross_attention_states is None and is_cross_attention_cache_empty:
+                 continue
+
+             if self.gradient_checkpointing and self.training:
+                 layer_outputs = self._gradient_checkpointing_func(
+                     decoder_layer.__call__,
+                     hidden_states,
+                     cross_attention_states,
+                     cross_attention_mask,
+                     causal_mask,
+                     full_text_row_masked_out_mask,
+                     position_ids,
+                     past_key_values,
+                     output_attentions,
+                     use_cache,
+                     cache_position,
+                     position_embeddings,
+                 )
+             else:
+                 layer_outputs = decoder_layer(
+                     hidden_states,
+                     cross_attention_states=cross_attention_states,
+                     cross_attention_mask=cross_attention_mask,
+                     attention_mask=causal_mask,
+                     full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+                     position_ids=position_ids,
+                     past_key_value=past_key_values,
+                     output_attentions=output_attentions,
+                     use_cache=use_cache,
+                     cache_position=cache_position,
+                     position_embeddings=position_embeddings,
+                 )
+
+             hidden_states = layer_outputs[0]
+
+             if use_cache:
+                 next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+             if output_attentions:
+                 all_self_attns += (layer_outputs[1],)
+
+         hidden_states = self.norm(hidden_states)
+
+         # add hidden states from the last decoder layer
+         if output_hidden_states:
+             all_hidden_states += (hidden_states,)
+
+         next_cache = next_decoder_cache if use_cache else None
+
+         if not return_dict:
+             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+         return BaseModelOutputWithPast(
+             last_hidden_state=hidden_states,
+             past_key_values=next_cache,
+             hidden_states=all_hidden_states,
+             attentions=all_self_attns,
+         )
+
+     # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask
+     def _update_causal_mask(
+         self,
+         attention_mask: torch.Tensor,
+         input_tensor: torch.Tensor,
+         cache_position: torch.Tensor,
+         past_key_values: Cache,
+         output_attentions: bool,
+     ):
+         if self.config._attn_implementation == "flash_attention_2":
+             if attention_mask is not None and 0.0 in attention_mask:
+                 return attention_mask
+             return None
+
+         # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+         # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+         # to infer the attention mask.
+         past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+         using_static_cache = isinstance(past_key_values, StaticCache)
+
+         # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+         if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
+             if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                 attention_mask,
+                 inputs_embeds=input_tensor,
+                 past_key_values_length=past_seen_tokens,
+                 is_training=self.training,
+             ):
+                 return None
+
+         dtype, device = input_tensor.dtype, input_tensor.device
+         sequence_length = input_tensor.shape[1]
+         if using_static_cache:
+             target_length = past_key_values.get_max_cache_shape()
+         else:
+             target_length = (
+                 attention_mask.shape[-1]
+                 if isinstance(attention_mask, torch.Tensor)
+                 else past_seen_tokens + sequence_length + 1
+             )
+
+         # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+         causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+             attention_mask,
+             sequence_length=sequence_length,
+             target_length=target_length,
+             dtype=dtype,
+             device=device,
+             cache_position=cache_position,
+             batch_size=input_tensor.shape[0],
+         )
+
+         if (
+             self.config._attn_implementation == "sdpa"
+             and attention_mask is not None
+             and attention_mask.device.type == "cuda"
+             and not output_attentions
+         ):
+             # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+             # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+             # Details: https://github.com/pytorch/pytorch/issues/110213
+             min_dtype = torch.finfo(dtype).min
+             causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+         return causal_mask
+
+     @staticmethod
+     # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position
+     def _prepare_4d_causal_attention_mask_with_cache_position(
+         attention_mask: torch.Tensor,
+         sequence_length: int,
+         target_length: int,
+         dtype: torch.dtype,
+         device: torch.device,
+         cache_position: torch.Tensor,
+         batch_size: int,
+         **kwargs,
+     ):
+         if attention_mask is not None and attention_mask.dim() == 4:
+             # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+             causal_mask = attention_mask
+         else:
+             min_dtype = torch.finfo(dtype).min
+             causal_mask = torch.full(
+                 (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device
+             )
+             if sequence_length != 1:
+                 causal_mask = torch.triu(causal_mask, diagonal=1)
+             causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+             causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+             if attention_mask is not None:
+                 causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                 mask_length = attention_mask.shape[-1]
+                 padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
+                 padding_mask = padding_mask == 0
+                 causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                     padding_mask, min_dtype
+                 )
+
+         return causal_mask
+
+ class Llama3ForCausalLM(MllamaPreTrainedModel, GenerationMixin):
+     config_class = MllamaTextConfig
+     base_model_prefix = "model"
+     #_tied_weights_keys = ["lm_head.weight"]
+
+     def __init__(self, config: MllamaTextConfig):
+         super().__init__(config.get_text_config())
+         self.text_config = config.get_text_config()
+         self.vocab_size = self.text_config.vocab_size
+         self.model = Llama3TextModel._from_config(self.text_config, attn_implementation=config._attn_implementation)
+         self.lm_head = nn.Linear(self.text_config.hidden_size, self.vocab_size, bias=False)
+
+         self.post_init()
+
+     def get_input_embeddings(self):
+         #return self.model.embed_tokens.text_embeddings
+         return None
+
+     def set_input_embeddings(self, value):
+         #self.model.embed_tokens.text_embeddings = value
+         pass
+
+     def get_output_embeddings(self):
+         return self.lm_head
+
+     def set_output_embeddings(self, new_embeddings):
+         self.lm_head = new_embeddings
+
+     def set_decoder(self, decoder):
+         self.model = decoder
+
+     def get_decoder(self):
+         return self.model
+
+     def forward(
+         self,
+         #input_ids: torch.LongTensor = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         cross_attention_states: Optional[torch.LongTensor] = None,
+         cross_attention_mask: Optional[torch.LongTensor] = None,
+         full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+         past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         num_logits_to_keep: int = 0,
+         **loss_kwargs,
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+         outputs = self.model(
+             #input_ids=input_ids,
+             cross_attention_states=cross_attention_states,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             cross_attention_mask=cross_attention_mask,
+             full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+             past_key_values=past_key_values,
+             inputs_embeds=inputs_embeds,
+             use_cache=use_cache,
+             output_attentions=output_attentions,
+             output_hidden_states=output_hidden_states,
+             return_dict=return_dict,
+             cache_position=cache_position,
+         )
+
+         hidden_states = outputs[0]
+         logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]).float()
+
+         loss = None
+         if labels is not None:
+             loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
+
+         if not return_dict:
+             output = (logits,) + outputs[1:]
+             return (loss,) + output if loss is not None else output
+
+         return CausalLMOutputWithPast(
+             loss=loss,
+             logits=logits,
+             past_key_values=outputs.past_key_values,
+             hidden_states=outputs.hidden_states,
+             attentions=outputs.attentions,
+         )
+
+ #AutoModelForCausalLM.register(MllamaTextConfig, Llama3ForCausalLM)
+ #transformers.Llama3ForCausalLM = Llama3ForCausalLM
+
+ class Llama3ForConditionalGeneration(MllamaPreTrainedModel, GenerationMixin):
+     config_class = Llama3Config
+     base_model_prefix = "model"
+     _supports_quantized_cache = False  # quant cache not supported in encoder-decoder setting
+
+     def __init__(self, config: Llama3Config):
+         super().__init__(config)
+         self.vocab_size = config.text_config.vocab_size
+         self.hidden_size = config.text_config.hidden_size
+         self.max_num_tiles = config.vision_config.max_num_tiles
+         self.vision_output_dim = config.vision_config.vision_output_dim
+         self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+
+         self.vision_model = MllamaVisionModel._from_config(config.vision_config)
+         self.language_model = MllamaForCausalLM._from_config(config.text_config)
+         # Freeze the language model's own embedding table; the audio-aware embeddings come from self.embed_tokens.
+         self.language_model.get_input_embeddings().weight.requires_grad = False
+         self.embed_tokens = Llama3Embedding(config.audio_config, config.text_config)
+         self.multi_modal_projector = nn.Linear(
+             config.vision_config.vision_output_dim,
+             config.text_config.hidden_size,
+             bias=True,
+         )
+         self.post_init()
+
+     def get_input_embeddings(self):
+         return self.language_model.get_input_embeddings()
+
+     def set_input_embeddings(self, value):
+         self.language_model.set_input_embeddings(value)
+
+     def get_output_embeddings(self):
+         return self.language_model.get_output_embeddings()
+
+     def set_output_embeddings(self, new_embeddings):
+         self.language_model.set_output_embeddings(new_embeddings)
+
+     def set_decoder(self, decoder):
+         self.language_model.set_decoder(decoder)
+
+     def get_decoder(self):
+         return self.language_model.get_decoder()
+
+     def tie_weights(self):
+         return self.language_model.tie_weights()
+
+     def forward(
+         self,
+         input_ids: Optional[torch.LongTensor] = None,
+         audio_features: Optional[torch.FloatTensor] = None,
+         pixel_values: Optional[torch.FloatTensor] = None,
+         aspect_ratio_mask: Optional[torch.Tensor] = None,
+         aspect_ratio_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         cross_attention_mask: Optional[torch.Tensor] = None,
+         cross_attention_states: Optional[torch.Tensor] = None,
+         position_ids: Optional[torch.LongTensor] = None,
+         past_key_values: Optional[List[torch.FloatTensor]] = None,
+         inputs_embeds: Optional[torch.FloatTensor] = None,
+         labels: Optional[torch.LongTensor] = None,
+         use_cache: Optional[bool] = None,
+         output_attentions: Optional[bool] = None,
+         output_hidden_states: Optional[bool] = None,
+         return_dict: Optional[bool] = None,
+         cache_position: Optional[torch.LongTensor] = None,
+         num_logits_to_keep: int = 0,
+     ) -> Union[Tuple, CausalLMOutputWithPast]:
+         r"""
+         Args:
+             labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                 Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                 config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                 (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+             num_logits_to_keep (`int`, *optional*):
+                 Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+                 `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+                 token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+
+         Returns:
+
+         Example:
+
+         ```python
+         >>> from PIL import Image
+         >>> import requests
+         >>> from transformers import AutoProcessor, MllamaForConditionalGeneration
+
+         >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
+         >>> model = MllamaForConditionalGeneration.from_pretrained(checkpoint)
+         >>> processor = AutoProcessor.from_pretrained(checkpoint)
+
+         >>> prompt = "<|image|>If I had to write a haiku for this one"
+         >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+         >>> image = Image.open(requests.get(url, stream=True).raw)
+
+         >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+         >>> # Generate
+         >>> output = model.generate(**inputs, max_new_tokens=15)
+
+         >>> prompt_len = inputs.input_ids.shape[-1]
+         >>> generated_ids = output[:, prompt_len:]
+         >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+         >>> print(generated_text)
+         [', it would be:.\\nA stop sign in Chinatown.\\n']
+         ```
+         """
+         output_attentions = output_attentions if output_attentions is not None else self.config.text_config.output_attentions
+         output_hidden_states = (
+             output_hidden_states if output_hidden_states is not None else self.config.text_config.output_hidden_states
+         )
+         return_dict = return_dict if return_dict is not None else self.config.text_config.use_return_dict
+
+         if (input_ids is None) ^ (inputs_embeds is not None):
+             raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+         if pixel_values is not None and inputs_embeds is not None:
+             raise ValueError(
+                 "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
+             )
+
+         if pixel_values is not None and cross_attention_states is not None:
+             raise ValueError("`pixel_values` and `cross_attention_states` cannot be provided simultaneously")
+
+         if pixel_values is not None:
+             if aspect_ratio_ids is None:
+                 raise ValueError("`aspect_ratio_ids` must be provided if `pixel_values` is provided")
+             # get vision tokens from vision model
+             vision_outputs = self.vision_model(
+                 pixel_values=pixel_values,
+                 aspect_ratio_ids=aspect_ratio_ids,
+                 aspect_ratio_mask=aspect_ratio_mask,
+                 output_hidden_states=output_hidden_states,
+                 output_attentions=output_attentions,
+                 return_dict=return_dict,
+             )
+             cross_attention_states = vision_outputs[0]
+             cross_attention_states = self.multi_modal_projector(cross_attention_states).reshape(
+                 -1, cross_attention_states.shape[-2], self.hidden_size
+             )
+
+         if cross_attention_mask is not None:
+             cross_attention_mask, full_text_row_masked_out_mask = _prepare_cross_attention_mask(
+                 cross_attention_mask,
+                 num_vision_tokens=self.vision_model.num_patches,
+                 dtype=self.dtype,
+             )
+         else:
+             full_text_row_masked_out_mask = None
+
+         if cross_attention_mask is not None and cache_position is not None:
+             cross_attention_mask = cross_attention_mask[:, :, cache_position]
+             full_text_row_masked_out_mask = full_text_row_masked_out_mask[:, :, cache_position]
+
+         if inputs_embeds is None:
+             inputs_embeds = self.embed_tokens(input_ids=input_ids, audio_features=audio_features)
+
+         outputs = self.language_model(
+             #input_ids=input_ids,
+             #audio_features=audio_features,
+             attention_mask=attention_mask,
+             position_ids=position_ids,
+             cross_attention_states=cross_attention_states,
+             cross_attention_mask=cross_attention_mask,
+             full_text_row_masked_out_mask=full_text_row_masked_out_mask,
+             past_key_values=past_key_values,
+             use_cache=use_cache,
+             inputs_embeds=inputs_embeds,
+             labels=labels,
+             output_hidden_states=output_hidden_states,
+             output_attentions=output_attentions,
+             return_dict=return_dict,
+             cache_position=cache_position,
+             num_logits_to_keep=num_logits_to_keep,
+         )
+
+         return outputs
+
+     def prepare_inputs_for_generation(
+         self,
+         input_ids=None,
+         audio_features=None,
+         inputs_embeds=None,
+         attention_mask=None,
+         position_ids=None,
+         pixel_values=None,
+         aspect_ratio_ids=None,
+         aspect_ratio_mask=None,
+         cross_attention_mask=None,
+         past_key_values=None,
+         use_cache=False,
+         cache_position=None,
+         num_logits_to_keep=None,
+         **kwargs,
+     ):
+         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+         # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+         # Exception 1: when passing input_embeds, input_ids may be missing entries
+         # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+         if past_key_values is not None:
+             if inputs_embeds is not None:  # Exception 1
+                 input_ids = input_ids[:, -cache_position.shape[0] :]
+             elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                 input_ids = input_ids[:, cache_position]
+
+         # TODO: we have no attention_mask so this won't work, check if we really won't need attention mask and find another way
+         if attention_mask is not None and position_ids is None:
+             # create position_ids on the fly for batch generation
+             position_ids = attention_mask.long().cumsum(-1) - 1
+             position_ids.masked_fill_(attention_mask == 0, 1)
+             if past_key_values:
+                 position_ids = position_ids[:, -input_ids.shape[1] :]
+
+                 # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture.
+                 position_ids = position_ids.clone(memory_format=torch.contiguous_format)
+
+         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+         if inputs_embeds is not None and cache_position[0] == 0:
+             model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None}
+         else:
+             # The clone here is for the same reason as for `position_ids`.
+             model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None}
+
+         if num_logits_to_keep is not None:
+             model_inputs["num_logits_to_keep"] = num_logits_to_keep
+
+         model_inputs.update(
+             {
+                 "position_ids": position_ids,
+                 "cache_position": cache_position,
+                 "past_key_values": past_key_values,
+                 "use_cache": use_cache,
+                 "attention_mask": attention_mask,
+                 "cross_attention_mask": cross_attention_mask,
+             }
+         )
+
+         # If we're in pre-fill or cacheless decoding step, then we need pixel_values and aspect ratios
+         # to compute image hidden states, otherwise they are cached within each cross attn layer
+         if cache_position[0] == 0:
+             model_inputs["pixel_values"] = pixel_values
+             model_inputs["aspect_ratio_ids"] = aspect_ratio_ids
+             model_inputs["aspect_ratio_mask"] = aspect_ratio_mask
+
+         return model_inputs
+
+     def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder, **kwargs):
+         cross_attention_mask_prev = model_kwargs.get("cross_attention_mask", None)
+         model_kwargs = super()._update_model_kwargs_for_generation(
+             outputs=outputs,
+             model_kwargs=model_kwargs,
+             is_encoder_decoder=is_encoder_decoder,
+             **kwargs,
+         )
+
+         # add cross-attn mask for new token
+         if cross_attention_mask_prev is not None:
+             model_kwargs["cross_attention_mask"] = torch.cat(
+                 [cross_attention_mask_prev, cross_attention_mask_prev[:, -1:, ...]], dim=1
+             )
+         return model_kwargs
+
+ #AutoModel.register(Llama3Config, Llama3ForConditionalGeneration)
+ #transformers.Llama3ForConditionalGeneration = Llama3ForConditionalGeneration
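
Most of the attention plumbing above is copied from the upstream Llama/Mllama implementations. As a small self-contained illustration, the static mask builder can be called directly; here a batch of one left-padded prompt of length 4 is masked, with `min_dtype` marking disallowed positions.

```python
import torch

attention_mask = torch.tensor([[0, 1, 1, 1]])  # first position is padding
mask = Llama3TextModel._prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask,
    sequence_length=4,
    target_length=4,
    dtype=torch.float32,
    device="cpu",
    cache_position=torch.arange(4),
    batch_size=1,
)
print(mask.shape)          # torch.Size([1, 1, 4, 4])
print((mask == 0).int())   # 1 where attention is allowed: causal, with the pad column masked out
```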
processing_mllama.py ADDED
@@ -0,0 +1,377 @@
+ # coding=utf-8
+ # Copyright 2024 The HuggingFace Inc. team.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ """Processor class for Mllama."""
+
+ from typing import List, Optional, Union
+
+ import numpy as np
+ import torch
+ import transformers
+ from transformers import AutoProcessor
+ from transformers.feature_extraction_utils import BatchFeature
+ from transformers.image_utils import ImageInput
+ from transformers.processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, Unpack, AudioKwargs
+ from transformers.tokenization_utils_base import (
+     PreTokenizedInput,
+     TextInput,
+     AudioInput,
+ )
+
+ # TODO: Can we do it that way, or is it better to include it as "Copied from ..."?
+ from transformers.models.mllama.image_processing_mllama import make_list_of_images
+ from .audio_processing_mllama import build_audio_tokens
+
+
+ class MllamaImagesKwargs(ImagesKwargs, total=False):
+     max_image_tiles: Optional[int]
+
+ class MllamaProcessorKwargs(ProcessingKwargs, total=False):
+     images_kwargs: MllamaImagesKwargs
+
+     _defaults = {
+         "images_kwargs": {
+             "max_image_tiles": 4,
+         },
+     }
+
+
+ def get_cross_attention_token_mask(input_ids: List[int], image_token_id: int) -> List[List[int]]:
+     """
+     Generate a cross-attention token mask for image tokens in the input sequence.
+
+     This function identifies the positions of image tokens in the input sequence and creates
+     a mask that defines which subsequent tokens each image token should attend to.
+
+     Args:
+         input_ids (List[int]): A list of token ids representing the input sequence.
+         image_token_id (int): The id of the token used to represent images in the sequence.
+
+     Returns:
+         List[List[int]]: A list of [start, end] pairs, where each pair represents the range
+         of tokens an image token should attend to.
+
+     Notes:
+         - If no image tokens are present, an empty list is returned.
+         - For a single image token, it attends to all subsequent tokens until the end of the sequence.
+         - For multiple image tokens, each attends to tokens up to the next image token or the end of the sequence.
+         - Consecutive image tokens are treated as a group and attend to all subsequent tokens together.
+     """
+
+     image_token_locations = [i for i, token in enumerate(input_ids) if token == image_token_id]
+
+     if len(image_token_locations) == 0:
+         return []
+
+     # only one image present, unmask until end of sequence
+     if len(image_token_locations) == 1:
+         return [[image_token_locations[0], -1]]
+
+     vision_masks = [[loc1, loc2] for loc1, loc2 in zip(image_token_locations[:-1], image_token_locations[1:])]
+
+     # last image will attend to all subsequent text
+     vision_masks.append([image_token_locations[-1], len(input_ids)])
+
+     # if there are two or more consecutive vision tokens,
+     # they should all attend to all subsequent
+     # text present
+     last_mask_end = vision_masks[-1][1]
+     for vision_mask in vision_masks[::-1]:
+         if vision_mask[0] == vision_mask[1] - 1:
+             vision_mask[1] = last_mask_end
+         last_mask_end = vision_mask[1]
+
+     return vision_masks
+
+
+ def convert_sparse_cross_attention_mask_to_dense(
+     cross_attention_token_mask: List[List[List[int]]],
+     num_tiles: List[List[int]],
+     max_num_tiles: int,
+     length: int,
+ ) -> np.ndarray:
+     """
+     Convert the cross attention mask indices to a cross attention mask 4D array.
+
+     This function takes a sparse representation of cross attention masks and converts it to a dense 4D numpy array.
+     The sparse representation is a nested list structure that defines attention ranges for each image in each batch item.
+
+     Args:
+         cross_attention_token_mask (List[List[List[int]]]): A nested list structure where:
+             - The outer list represents the batch dimension.
+             - The middle list represents different images within each batch item.
+             - The inner list contains pairs of integers [start, end] representing token ranges for each image.
+         num_tiles (List[List[int]]): A nested list structure specifying the number of tiles for each image in each batch item.
+         max_num_tiles (int): The maximum possible number of tiles.
+         length (int): The total sequence length of the input.
+
+     Returns:
+         np.ndarray: A 4D numpy array of shape (batch_size, length, max_num_images, max_num_tiles).
+             The array contains `1` where attention is allowed and `0` where it is not.
+
+     Note:
+         - Special handling is done for cases where the end token is -1, which is interpreted as attending to the end of the sequence.
+     """
+
+     batch_size = len(cross_attention_token_mask)
+     max_num_images = max([len(masks) for masks in cross_attention_token_mask])
+
+     cross_attention_mask = np.zeros(
+         shape=(batch_size, length, max_num_images, max_num_tiles),
+         dtype=np.int64,
+     )
+
+     for sample_idx, (sample_masks, sample_num_tiles) in enumerate(zip(cross_attention_token_mask, num_tiles)):
+         for mask_idx, (locations, mask_num_tiles) in enumerate(zip(sample_masks, sample_num_tiles)):
+             if len(locations) == 2:
+                 start, end = locations
+                 end = min(end, length)
+                 if end == -1:
+                     end = length
+                 cross_attention_mask[sample_idx, start:end, mask_idx, :mask_num_tiles] = 1
+     return cross_attention_mask
+
+
+ def build_string_from_input(prompt: str, bos_token: str, image_token: str) -> str:
+     """
+     Builds a string from the input prompt by adding `bos_token` if not already present.
+
+     Args:
+         prompt (`str`):
+             The input prompt string.
+         bos_token (`str`):
+             The beginning of sentence token to be added.
+         image_token (`str`):
+             The image token used to identify the start of an image sequence.
+
+     Returns:
+         str: The modified prompt string with the `bos_token` added if necessary.
+
+     Examples:
+         >>> build_string_from_input("Hello world", "<begin_of_text>", "<|image|>")
+         '<begin_of_text>Hello world'
+
+         >>> build_string_from_input("<|image|>Hello world", "<begin_of_text>", "<|image|>")
+         '<|image|><begin_of_text>Hello world'
+
+         >>> build_string_from_input("<begin_of_text>Hello world", "<begin_of_text>", "<|image|>")
+         '<begin_of_text>Hello world'
+     """
+
+     if bos_token in prompt:
+         return prompt
+
+     num_image_tokens_on_start = 0
+     while prompt.startswith(image_token):
+         prompt = prompt[len(image_token) :]
+         num_image_tokens_on_start += 1
+
+     return f"{image_token * num_image_tokens_on_start}{bos_token}{prompt}"
+
+
+ class MllamaProcessor(ProcessorMixin):
+     r"""
+     Constructs a Mllama processor which wraps [`MllamaImageProcessor`], [`MllamaAudioFeatureExtractor`], and
+     [`PretrainedTokenizerFast`] into a single processor that inherits the image processor, audio feature extractor, and
+     tokenizer functionalities. See the [`~MllamaProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more
+     information.
+     The preferred way of passing kwargs is as a dictionary per modality, see usage example below.
+     ```python
+     from transformers import MllamaProcessor
+     from PIL import Image
+
+     processor = MllamaProcessor.from_pretrained("meta-llama/Llama-3.2-11B-Vision")
+
+     processor(
+         images=your_pil_image,
+         text=["<|image|>If I had to write a haiku for this one"],
+         images_kwargs = {"size": {"height": 448, "width": 448}},
+         text_kwargs = {"padding": "right"},
+         common_kwargs = {"return_tensors": "pt"},
+     )
+     ```
+
+     Args:
+         image_processor ([`MllamaImageProcessor`]):
+             The image processor is a required input.
+         audio_processor ([`MllamaAudioFeatureExtractor`]):
+             The audio feature extractor is a required input.
+         tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`]):
+             The tokenizer is a required input.
+
+     """
+
+     attributes = ["image_processor", "audio_processor", "tokenizer"]
+     image_processor_class = "MllamaImageProcessor"
+     audio_processor_class = "MllamaAudioFeatureExtractor"
+     tokenizer_class = "PreTrainedTokenizerFast"
+
+     def __init__(self, image_processor, audio_processor, tokenizer):
+         self.image_token = "<|image|>"
+         self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
+         self.audio_token = "<|audio|>"
+         self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
+         self.python_token = "<|python_tag|>"
+         self.python_token_id = tokenizer.convert_tokens_to_ids(self.python_token)
+         self.bos_token = tokenizer.bos_token
+         self.chat_template = tokenizer.chat_template
+         super().__init__(image_processor, audio_processor, tokenizer)
+         self.tokenizer.add_tokens([f"<|audio_{i}|>" for i in range(1, 50)])
+
+     def __call__(
+         self,
+         images: Optional[ImageInput] = None,
+         text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
+         audio: Optional[Union[AudioInput, List[AudioInput]]] = None,
+         videos=None,
+         **kwargs: Unpack[MllamaProcessorKwargs],
+     ) -> BatchFeature:
+         """
+         Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
+         arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] if `text` is not `None` to encode
+         the text. To prepare the image(s), this method forwards the `images` arguments to
+         MllamaImageProcessor's [`~MllamaImageProcessor.__call__`] if `images` is not `None`. Please refer
+         to the docstring of the above two methods for more information.
+
+         Args:
+             images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                 The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                 tensor. Both channels-first and channels-last formats are supported.
+             text (`str`, `List[str]`, `List[List[str]]`):
+                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+             return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                 If set, will return tensors of a particular framework. Acceptable values are:
+                 - `'tf'`: Return TensorFlow `tf.constant` objects.
+                 - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                 - `'np'`: Return NumPy `np.ndarray` objects.
+                 - `'jax'`: Return JAX `jnp.ndarray` objects.
+         Returns:
+             [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+             - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+             - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+               `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+               `None`).
+             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+             - **audio_features** -- Audio features extracted using SeamlessM4TFeatureExtractor. Returned when `audio` is not `None`.
+             TODO: add aspect_ratio_ids and aspect_ratio_mask and cross_attention_mask
+         """
+         if text is None:
+             raise ValueError("You must specify text.")
+
+         output_kwargs = self._merge_kwargs(
+             MllamaProcessorKwargs,
+             tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+             **kwargs,
+         )
+
+         text_kwargs = output_kwargs["text_kwargs"]
+         images_kwargs = output_kwargs["images_kwargs"]
+         common_kwargs = output_kwargs["common_kwargs"]
+
+         data = {}
+
+         if audio is not None:
+             audio_features = self.audio_processor(audio)
+             data.update(audio_features)
+
+         if isinstance(text, str):
+             text = [text]
+         elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
+             raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+         n_images_in_text = [t.count(self.image_token) for t in text]
+         text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text]
+         _ = text_kwargs.pop("padding_side", None)  # hack until padding-side is an accepted kwarg by tokenizers
+
+         if audio is not None:
+             # Expand each <|audio|> into one placeholder token per audio embedding.
+             text = build_audio_tokens(text, audio_features, self.audio_token)
+
+         encoding = self.tokenizer(text, add_special_tokens=False, **text_kwargs)
+         if audio is not None:
+             # Remap the <|audio_k|> placeholder ids to the negative ids (-1, -2, ...) expected by Llama3Embedding.
+             beg_audio_id = self.tokenizer.convert_tokens_to_ids("<|audio_1|>")
+             idx = torch.where(encoding['input_ids'] >= beg_audio_id)
+             encoding['input_ids'][idx] = beg_audio_id - encoding['input_ids'][idx] - 1
+         data.update(encoding)
+
+         n_images_in_images = [0]
+         if images is not None:
+             images = make_list_of_images(images)
+             n_images_in_images = [len(sample) for sample in images]
+
+         if text is not None:
+             if any(batch_img == 0 for batch_img in n_images_in_text) and not all(
+                 batch_img == 0 for batch_img in n_images_in_text
+             ):
+                 raise ValueError(
+                     "If a batch of text is provided, there should be either no images or at least one image per sample"
+                 )
+             if sum(n_images_in_images) != sum(n_images_in_text):
+                 if images is None:
+                     raise ValueError("No images were provided, but there are image tokens in the prompt")
+                 else:
+                     raise ValueError(
+                         f"The number of image tokens ({sum(n_images_in_text)}) should match the number of provided images ({sum(n_images_in_images)})"
+                     )
+
+         if images is not None:
+             image_features = self.image_processor(images, **images_kwargs)
+             num_tiles = image_features.pop("num_tiles")
+             data.update(image_features)
+
+         # Create cross attention mask
+         if images is not None and text is not None:
+             cross_attention_token_mask = [
+                 get_cross_attention_token_mask(token_ids, self.image_token_id) for token_ids in encoding["input_ids"]
+             ]
+             cross_attention_mask = convert_sparse_cross_attention_mask_to_dense(
+                 cross_attention_token_mask,
+                 num_tiles=num_tiles,
+                 max_num_tiles=self.image_processor.max_image_tiles,
+                 length=max(len(input_ids) for input_ids in encoding["input_ids"]),
+             )
+             data["cross_attention_mask"] = cross_attention_mask
+
+         return_tensors = common_kwargs.pop("return_tensors", None)
+         batch_feature = BatchFeature(data=data, tensor_type=return_tensors)
+
+         return batch_feature
+
+     def batch_decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+         refer to the docstring of this method for more information.
+         """
+         return self.tokenizer.batch_decode(*args, **kwargs)
+
+     def decode(self, *args, **kwargs):
+         """
+         This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+         the docstring of this method for more information.
+         """
+         return self.tokenizer.decode(*args, **kwargs)
+
+     @property
+     def model_input_names(self):
+         tokenizer_input_names = self.tokenizer.model_input_names
+         image_processor_input_names = self.image_processor.model_input_names
+         audio_processor_input_names = self.audio_processor.model_input_names
+         return list(
+             tokenizer_input_names
+             + image_processor_input_names
+             + ["cross_attention_mask"]
+             + audio_processor_input_names
+         )
+
+ AutoProcessor.register("MllamaProcessor", MllamaProcessor)
+ transformers.MllamaProcessor = MllamaProcessor
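
End to end, the processor ties the pieces together. The sketch below shows only the intended call pattern; `"path/to/llama3-audio-checkpoint"` is a placeholder for a repository that ships these classes via `trust_remote_code`, and the output keys follow the `__call__` implementation above.

```python
import numpy as np
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/llama3-audio-checkpoint", trust_remote_code=True)

audio_clip = np.random.randn(16000).astype(np.float32)  # 1 s of 16 kHz audio
inputs = processor(
    text=["<|audio|>Describe what you hear."],
    audio=[[audio_clip]],  # one sample with one clip
    return_tensors="pt",
)
# input_ids contain negative placeholder ids where audio embeddings will be spliced in,
# and audio_features is the packed (batch, clips, frames, features) array.
print(inputs["input_ids"].shape, inputs["audio_features"].shape)
```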