# coding=utf-8 # Copyright 2022 The HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ Classes to support Flax Speech-Encoder-Decoder architectures""" import os from functools import partial from typing import Optional, Tuple, Union, Dict import flax import flax.linen as nn import jax import jax.numpy as jnp from flax.core.frozen_dict import FrozenDict, unfreeze from jax import lax from jax.random import PRNGKey import numpy as np from transformers.modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutputWithCrossAttentions, FlaxSeq2SeqLMOutput from transformers.modeling_flax_utils import FlaxPreTrainedModel from transformers.utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings, ModelOutput from transformers.generation_flax_utils import FlaxLogitsProcessorList from models import ( FlaxWav2Vec2Model, FlaxWav2Vec2Module, FlaxBartForCausalLM, FlaxBartForCausalLMModule, BartConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, ) logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "SpeechEncoderDecoderConfig" SPEECH_ENCODER_DECODER_START_DOCSTRING = r""" This class can be used to initialize a speech-sequence-to-text-sequence model with any pretrained speech autoencoding model as the encoder and any pretrained text autoregressive model as the decoder. The encoder is loaded via [`~AutoModel.from_pretrained`] function and the decoder is loaded via [`~AutoModelForCausalLM.from_pretrained`] function. Cross-attention layers are automatically added to the decoder and should be fine-tuned on a downstream generative task, like summarization. The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation tasks was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn. Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. Additionally, in [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) it is shown how leveraging large pretrained speech models for speech translation yields a significant performance improvement. After such an Speech-Encoder Decoder model has been trained/fine-tuned, it can be saved/loaded just like any other models (see the examples for more information). This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) This model is also a Flax Linen [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. Parameters: config ([`SpeechEncoderDecoderConfig`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and `jax.numpy.bfloat16` (on TPUs). This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If specified all the computation will be performed with the given `dtype`. **Note that this only specifies the dtype of the computation and does not influence the dtype of model parameters.** If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and [`~FlaxPreTrainedModel.to_bf16`]. """ SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING = r""" Args: inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*): Float values of input raw speech waveform or speech features. Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile library (*pip install soundfile*). To prepare the array into *inputs*, either the [`Wav2Vec2Processor`] or [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type *torch.FloatTensor*. attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id` and prepending them with the `decoder_start_token_id`. decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default. decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.max_position_embeddings - 1]`. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): If set to `True`, the model will return a [`~utils.FlaxSeq2SeqLMOutput`] instead of a plain tuple. """ SPEECH_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING = r""" Args: inputs (`jnp.ndarray` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, feature_dim)`, *optional*): Float values of input raw speech waveform or speech features. Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type *List[float]* or a *numpy.ndarray*, *e.g.* via the soundfile library (*pip install soundfile*). To prepare the array into *inputs*, either the [`Wav2Vec2Processor`] or [`Speech2TextProcessor`] should be used for padding and conversion into a tensor of type *torch.FloatTensor*. attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): If set to `True`, the model will return a [`~utils.FlaxBaseModelOutput`] instead of a plain tuple. """ SPEECH_ENCODER_DECODER_DECODE_INPUTS_DOCSTRING = r""" Args: decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using [`PreTrainedTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids) If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). For sequence to sequence training, `decoder_input_ids` should be provided. `decoder_input_ids` should be created outside of the model by shifting the `labels` to the right, replacing -100 by the `pad_token_id` and prepending them with the `decoder_start_token_id`. encoder_outputs (`tuple(tuple(jnp.ndarray)`): Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also be used by default. decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the range `[0, config.decoder.max_position_embeddings - 1]`. past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): If set to `True`, the model will return a [`~utils.FlaxCausalLMOutputWithCrossAttentions`] instead of a plain tuple. """ @flax.struct.dataclass class FlaxBeamSearchOutput(ModelOutput): """ Flax Base class for outputs of decoder-only generation models using greedy search. Args: sequences (`jnp.ndarray` of shape `(batch_size, max_length)`): The generated sequences. scores (`jnp.ndarray` of shape `(batch_size,)`): The scores (log probabilites) of the generated sequences. """ sequences: jnp.ndarray = None scores: jnp.ndarray = None @flax.struct.dataclass class BeamSearchState: cur_len: jnp.ndarray running_sequences: jnp.ndarray running_scores: jnp.ndarray sequences: jnp.ndarray scores: jnp.ndarray is_sent_finished: jnp.ndarray model_kwargs: Dict[str, jnp.ndarray] class FlaxSpeechEncoderDecoderModule(nn.Module): config: SpeechEncoderDecoderConfig dtype: jnp.dtype = jnp.float32 def setup(self): encoder_config = self.config.encoder decoder_config = self.config.decoder # TODO: configure FlaxAutoModel mappings (required when trialling different encoder-decoder combinations) encoder_module = FlaxWav2Vec2Module decoder_module = FlaxBartForCausalLMModule self.encoder = encoder_module(encoder_config, dtype=self.dtype) self.decoder = decoder_module(decoder_config, dtype=self.dtype) # encoder outputs might need to be projected to different dimension for decoder if ( self.encoder.config.hidden_size != self.decoder.config.hidden_size and self.decoder.config.cross_attention_hidden_size is None ): self.enc_to_dec_proj = nn.Dense( self.decoder.config.hidden_size, kernel_init=jax.nn.initializers.normal(self.decoder.config.initializer_range), dtype=self.dtype, ) else: self.enc_to_dec_proj = None def _get_feat_extract_output_lengths( self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None ): """ Computes the output length of the convolutional layers """ add_adapter = self.config.encoder.add_adapter if add_adapter is None else add_adapter def _conv_out_length(input_length, kernel_size, stride): # 1D convolutional layer output length formula taken # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html return (input_length - kernel_size) // stride + 1 for kernel_size, stride in zip(self.config.encoder.conv_kernel, self.config.encoder.conv_stride): input_lengths = _conv_out_length(input_lengths, kernel_size, stride) if add_adapter: for _ in range(self.config.encoder.num_adapter_layers): input_lengths = _conv_out_length(input_lengths, 1, self.config.encoder.adapter_stride) return input_lengths def _get_encoder_module(self): return self.encoder def _get_projection_module(self): return self.enc_to_dec_proj def _get_decoder_module(self): return self.decoder def __call__( self, inputs, attention_mask, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_outputs=None, extract_features=None, output_attentions: bool = False, output_hidden_states: bool = False, output_features: bool = False, return_dict: bool = True, deterministic: bool = True, freeze_feature_encoder: bool = False, ): if encoder_outputs is None: encoder_outputs = self.encoder( inputs, attention_mask=attention_mask, extract_features=extract_features, output_attentions=output_attentions, output_hidden_states=output_hidden_states, output_features=output_features, return_dict=return_dict, deterministic=deterministic, freeze_feature_encoder=freeze_feature_encoder, ) if output_features: return encoder_outputs encoder_hidden_states = encoder_outputs[0] # optionally project encoder_hidden_states if self.enc_to_dec_proj is not None: encoder_hidden_states = self.enc_to_dec_proj(encoder_hidden_states) # compute correct encoder attention mask if attention_mask is not None: encoder_attention_mask = self.encoder._get_feature_vector_attention_mask( encoder_hidden_states.shape[1], attention_mask ) else: encoder_attention_mask = None # flax script modeling_flax_wav2vec2.py decoder_outputs = self.decoder( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, position_ids=decoder_position_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, deterministic=deterministic, ) if not return_dict: return decoder_outputs + encoder_outputs return FlaxSeq2SeqLMOutput( logits=decoder_outputs.logits, decoder_hidden_states=decoder_outputs.hidden_states, decoder_attentions=decoder_outputs.attentions, cross_attentions=decoder_outputs.cross_attentions, encoder_last_hidden_state=encoder_hidden_states, encoder_hidden_states=encoder_outputs.hidden_states, encoder_attentions=encoder_outputs.attentions, ) @add_start_docstrings(SPEECH_ENCODER_DECODER_START_DOCSTRING) class FlaxSpeechEncoderDecoderModel(FlaxPreTrainedModel): r""" [`FlaxSpeechEncoderDecoderModel`] is a generic model class that will be instantiated as a transformer architecture with the module (flax.nn.Module) of one of the base model classes of the library as encoder module and another one as decoder module when created with the :meth*~transformers.FlaxAutoModel.from_pretrained* class method for the encoder and :meth*~transformers.FlaxAutoModelForCausalLM.from_pretrained* class method for the decoder. """ config_class = SpeechEncoderDecoderConfig base_model_prefix: str = "speech_encoder_decoder" module_class = FlaxSpeechEncoderDecoderModule def __init__( self, config: SpeechEncoderDecoderConfig, input_shape: Optional[Tuple] = None, seed: int = 0, dtype: jnp.dtype = jnp.float32, _do_init: bool = True, **kwargs ): if not _do_init: raise ValueError( "`FlaxSpeechEncoderDecoderModel` cannot be created without initializing, `_do_init` must be `True`." ) if config.decoder.cross_attention_hidden_size is not None: # Raise ValueError or option to project enc to dec hidden_size (eg EncAdapterLayer) if config.decoder.cross_attention_hidden_size != config.encoder.hidden_size: raise ValueError( "If `cross_attention_hidden_size` is specified in the decoder's configuration, " "it has to be equal to the encoder's `hidden_size`. " f"Got {config.decoder.cross_attention_hidden_size} for `config.decoder.cross_attention_hidden_size` " f"and {config.encoder.hidden_size} for `config.encoder.hidden_size`." ) # make sure input & output embeddings are not tied config.tie_word_embeddings = False module = self.module_class(config=config, dtype=dtype, **kwargs) if input_shape is None: # speech encoders almost always downsample the sequence length dimension encoder_input_length = 1024 decoder_input_length = module._get_feat_extract_output_lengths(encoder_input_length) input_shape = ((1, encoder_input_length), (1, decoder_input_length)) super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple) -> FrozenDict: encoder_input_shape, decoder_input_shape = input_shape # init input DeviceArrays inputs = jnp.zeros(encoder_input_shape, dtype="f4") attention_mask = jnp.ones_like(inputs, dtype="i4") decoder_input_ids = jnp.zeros(decoder_input_shape, dtype="i4") decoder_attention_mask = jnp.ones_like(decoder_input_ids) batch_size, sequence_length = inputs.shape decoder_batch_size, decoder_sequence_length = decoder_input_ids.shape if not decoder_batch_size == batch_size: raise ValueError( f"The inputs of encoder and decoder should have the same batch size, but got {batch_size} for encoder and {decoder_batch_size} for decoder." ) decoder_position_ids = jnp.broadcast_to( jnp.arange(decoder_sequence_length)[None, :], (decoder_batch_size, decoder_sequence_length) ) params_rng, dropout_rng = jax.random.split(rng) rngs = {"params": params_rng, "dropout": dropout_rng} return self.module.init( rngs, inputs, attention_mask, decoder_input_ids, decoder_attention_mask, decoder_position_ids, )["params"] def init_cache(self, batch_size, max_length, encoder_outputs): r""" Args: batch_size (`int`): batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. max_length (`int`): maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized cache. encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`): `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. """ # init input variables to retrieve cache decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4") decoder_attention_mask = jnp.ones_like(decoder_input_ids) decoder_position_ids = jnp.broadcast_to( jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape ) def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): decoder_module = module._get_decoder_module() return decoder_module( input_ids=decoder_input_ids, attention_mask=decoder_attention_mask, position_ids=decoder_position_ids, **kwargs, ) init_variables = self.module.init( jax.random.PRNGKey(0), decoder_input_ids=decoder_input_ids, decoder_attention_mask=decoder_attention_mask, decoder_position_ids=decoder_position_ids, encoder_hidden_states=encoder_outputs[0], init_cache=True, method=_decoder_forward, # we only need to call the decoder to init the cache ) return unfreeze(init_variables["cache"]) def _get_feat_extract_output_lengths( self, input_lengths: Union[jnp.ndarray, int], add_adapter: Optional[bool] = None ): return self.module._get_feat_extract_output_lengths(input_lengths, add_adapter=add_adapter) @add_start_docstrings(SPEECH_ENCODER_DECODER_ENCODE_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=_CONFIG_FOR_DOC) def encode( self, inputs: jnp.ndarray, attention_mask: Optional[jnp.ndarray] = None, extract_features: Optional[jnp.ndarray] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_features: Optional[bool] = None, return_dict: Optional[bool] = None, train: bool = False, freeze_feature_encoder: bool = False, params: dict = None, dropout_rng: PRNGKey = None, ): r""" Returns: Example: ```python >>> from transformers import FlaxSpeechEncoderDecoderModel >>> # initialize a wav2vec2-2-bart from pretrained wav2vec2 and bart models. Note that the cross-attention layers will be randomly initialized >>> model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained( ... "facebook/wav2vec2-large-lv60", "facebook/bart-large" ... ) >>> inputs = jnp.ones((2, 5000), dtype=jnp.float32) >>> encoder_outputs = model.encode(inputs) ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.return_dict if attention_mask is None: attention_mask = jnp.ones_like(inputs, dtype="i4") if extract_features is not None: extract_features = jnp.array(extract_features, dtype="f4") # Handle any PRNG if needed rngs = {} if dropout_rng is not None: rngs["dropout"] = dropout_rng def _encoder_forward(module, inputs, attention_mask, **kwargs): encode_module = module._get_encoder_module() return encode_module(inputs, attention_mask, **kwargs) outputs = self.module.apply( {"params": params or self.params}, inputs=jnp.array(inputs, dtype="f4"), attention_mask=jnp.array(attention_mask, dtype="i4"), extract_features=extract_features, output_attentions=output_attentions, output_hidden_states=output_hidden_states, output_features=output_features, return_dict=return_dict, deterministic=not train, freeze_feature_encoder=freeze_feature_encoder, rngs=rngs, method=_encoder_forward, ) if return_dict and not output_features: outputs = FlaxBaseModelOutput( last_hidden_state=outputs.last_hidden_state, hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) return outputs @add_start_docstrings(SPEECH_ENCODER_DECODER_DECODE_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) def decode( self, decoder_input_ids, encoder_outputs, encoder_attention_mask: Optional[jnp.ndarray] = None, decoder_attention_mask: Optional[jnp.ndarray] = None, decoder_position_ids: Optional[jnp.ndarray] = None, past_key_values: dict = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, train: bool = False, params: dict = None, dropout_rng: PRNGKey = None, ): r""" Returns: Example: ```python >>> from transformers import FlaxSpeechEncoderDecoderModel >>> import jax.numpy as jnp >>> # initialize a wav2vec2-2-bart from pretrained wav2vec2 and bart models. Note that the cross-attention layers will be randomly initialized >>> model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained( ... "facebook/wav2vec2-large-lv60", "facebook/bart-large" ... ) >>> inputs = jnp.ones((2, 5000), dtype=jnp.float32) >>> encoder_outputs = model.encode(inputs) >>> decoder_start_token_id = model.config.decoder.bos_token_id >>> decoder_input_ids = jnp.ones((inputs.shape[0], 1), dtype="i4") * decoder_start_token_id >>> outputs = model.decode(decoder_input_ids, encoder_outputs) >>> logits = outputs.logits ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.return_dict encoder_hidden_states = encoder_outputs[0] if encoder_attention_mask is None: batch_size, sequence_length = encoder_hidden_states.shape[:2] encoder_attention_mask = jnp.ones((batch_size, sequence_length)) batch_size, sequence_length = decoder_input_ids.shape if decoder_attention_mask is None: decoder_attention_mask = jnp.ones((batch_size, sequence_length)) if decoder_position_ids is None: if past_key_values is not None: raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") decoder_position_ids = jnp.broadcast_to( jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) ) # Handle any PRNG if needed rngs = {} if dropout_rng is not None: rngs["dropout"] = dropout_rng params = {"params": params or self.params} # if past_key_values are passed then cache is already initialized a private flag init_cache has to be # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that # it can be changed by FlaxBartAttention module if past_key_values: params["cache"] = past_key_values mutable = ["cache"] else: mutable = False def _decoder_forward( module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs ): projection_module = module._get_projection_module() decoder_module = module._get_decoder_module() # optionally project encoder_hidden_states if projection_module is not None: encoder_hidden_states = projection_module(encoder_hidden_states) return decoder_module( decoder_input_ids, decoder_attention_mask, decoder_position_ids, encoder_hidden_states, **kwargs, ) outputs = self.module.apply( params, decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, deterministic=not train, rngs=rngs, mutable=mutable, method=_decoder_forward, ) # add updated cache to model output if past_key_values is not None and return_dict: outputs, past = outputs outputs["past_key_values"] = unfreeze(past["cache"]) return outputs elif past_key_values is not None and not return_dict: outputs, past = outputs outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] return outputs @add_start_docstrings_to_model_forward(SPEECH_ENCODER_DECODER_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) def __call__( self, inputs: jnp.ndarray, attention_mask: Optional[jnp.ndarray] = None, extract_features: Optional[jnp.ndarray] = None, decoder_input_ids: Optional[jnp.ndarray] = None, decoder_attention_mask: Optional[jnp.ndarray] = None, decoder_position_ids: Optional[jnp.ndarray] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, output_features: Optional[bool] = None, return_dict: Optional[bool] = None, train: bool = False, freeze_feature_encoder: bool = False, params: dict = None, dropout_rng: PRNGKey = None, ): r""" Returns: Examples: ```python >>> from transformers import FlaxSpeechEncoderDecoderModel, BartTokenizer >>> # load a fine-tuned wav2vec2-2-bart model >>> model = FlaxSpeechEncoderDecoderModel.from_pretrained("patrickvonplaten/wav2vec2-2-bart-large") >>> # load output tokenizer >>> tokenizer_output = BartTokenizer.from_pretrained("facebook/bart-large") >>> inputs = jnp.ones((2, 5000), dtype=jnp.float32) >>> # use bart's special bos, pad and eos tokens >>> model.config.decoder_start_token_id = model.decoder.config.bos_token_id >>> model.config.pad_token_id = model.decoder.config.pad_token_id >>> model.config.eos_token_id = model.decoder.config.eos_token_id >>> outputs = model.generate(inputs) # Assert something? More interesting input? dtype correct? ``` """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.return_dict # prepare encoder inputs if attention_mask is None: attention_mask = jnp.ones_like(inputs, dtype="i4") if extract_features is not None: inputs = None # we can omit passing the inputs to the model to save memory extract_features = jnp.array(extract_features, dtype="f4") else: inputs = jnp.array(inputs, dtype="f4") # prepare decoder inputs if decoder_input_ids is None: raise ValueError( "`decoder_input_ids` cannot be `None`. For sequence to sequence training, `decoder_position_ids` must be specified as an input argument." ) if decoder_attention_mask is None: decoder_attention_mask = jnp.ones_like(decoder_input_ids) if decoder_position_ids is None: batch_size, sequence_length = decoder_input_ids.shape decoder_position_ids = jnp.broadcast_to( jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) ) # Handle any PRNG if needed rngs = {"dropout": dropout_rng} if dropout_rng is not None else {} return self.module.apply( {"params": params or self.params}, inputs=inputs, attention_mask=jnp.array(attention_mask, dtype="i4"), extract_features=extract_features, decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), output_attentions=output_attentions, output_hidden_states=output_hidden_states, output_features=output_features, return_dict=return_dict, deterministic=not train, freeze_feature_encoder=freeze_feature_encoder, rngs=rngs, ) def prepare_inputs_for_generation( self, decoder_input_ids, max_length, attention_mask: Optional[jnp.DeviceArray] = None, decoder_attention_mask: Optional[jnp.DeviceArray] = None, encoder_outputs=None, **kwargs ): # initializing the cache batch_size, seq_length = decoder_input_ids.shape past_key_values = self.init_cache(batch_size, max_length, encoder_outputs) # Note that usually one would have to put 0's in the attention_mask for x > input.shape[-1] and x < cache_length. # But since the decoder uses a causal mask, those positions are masked anyways. # Thus we can create a single static attention_mask here, which is more efficient for compilation extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") if decoder_attention_mask is not None: decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1 extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0)) else: decoder_position_ids = jnp.broadcast_to( jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length) ) return { "past_key_values": past_key_values, "encoder_outputs": encoder_outputs, "encoder_attention_mask": attention_mask, "decoder_attention_mask": extended_attention_mask, "decoder_position_ids": decoder_position_ids, } def update_inputs_for_generation(self, model_outputs, model_kwargs): model_kwargs["past_key_values"] = model_outputs.past_key_values model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1 return model_kwargs @classmethod def from_encoder_decoder_pretrained( cls, encoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, decoder_pretrained_model_name_or_path: Optional[Union[str, os.PathLike]] = None, *model_args, **kwargs ) -> FlaxPreTrainedModel: r""" Instantiate an encoder and a decoder from one or two base classes of the library from pretrained model checkpoints. Params: encoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*): Information necessary to initiate the encoder. Can be either: - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - A path to a *directory* containing model weights saved using [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. decoder_pretrained_model_name_or_path (`Union[str, os.PathLike]`, *optional*, defaults to `None`): Information necessary to initiate the decoder. Can be either: - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a user or organization name, like `dbmdz/bert-base-german-cased`. - A path to a *directory* containing model weights saved using [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. model_args (remaining positional arguments, *optional*): All remaning positional arguments will be passed to the underlying model's `__init__` method. kwargs (remaining dictionary of keyword arguments, *optional*): Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., `output_attentions=True`). - To update the encoder configuration, use the prefix *encoder_* for each configuration parameter. - To update the decoder configuration, use the prefix *decoder_* for each configuration parameter. - To update the parent model configuration, do not use a prefix for each configuration parameter. Behaves differently depending on whether a `config` is provided or automatically loaded. Example: ```python >>> from transformers import FlaxSpeechEncoderDecoderModel >>> # initialize a wav2vec2-2-bart from pretrained wav2vec2 and bart models. Note that the cross-attention layers will be randomly initialized >>> model = FlaxSpeechEncoderDecoderModel.from_encoder_decoder_pretrained( ... "facebook/wav2vec2-large-lv60", "facebook/bart-large" ... ) >>> # saving model after fine-tuning >>> model.save_pretrained("./wav2vec2-2-bart-large") >>> # load fine-tuned model >>> model = FlaxSpeechEncoderDecoderModel.from_pretrained("./wav2vec2-2-bart-large") ```""" kwargs_encoder = { argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_") } kwargs_decoder = { argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") } # remove encoder, decoder kwargs from kwargs for key in kwargs_encoder.keys(): del kwargs["encoder_" + key] for key in kwargs_decoder.keys(): del kwargs["decoder_" + key] # Load and initialize the encoder and decoder # The distinction between encoder and decoder at the model level is made # by the value of the flag `is_decoder` that we need to set correctly. encoder = kwargs_encoder.pop("model", None) if encoder is None: if encoder_pretrained_model_name_or_path is None: raise ValueError( "If `encoder_model` is not defined as an argument, a `encoder_pretrained_model_name_or_path` has " "to be defined." ) if "config" not in kwargs_encoder: # TODO: AutoConfig .from_pretrained encoder_config, kwargs_encoder = Wav2Vec2Config.from_pretrained( encoder_pretrained_model_name_or_path, **kwargs_encoder, return_unused_kwargs=True ) if encoder_config.is_decoder is True or encoder_config.add_cross_attention is True: logger.info( f"Initializing {encoder_pretrained_model_name_or_path} as a encoder model " "from a decoder model. Cross-attention and casual mask are disabled." ) encoder_config.is_decoder = False encoder_config.add_cross_attention = False kwargs_encoder["config"] = encoder_config # TODO: FlaxAutoModel .from_pretrained encoder = FlaxWav2Vec2Model.from_pretrained( encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder ) decoder = kwargs_decoder.pop("model", None) if decoder is None: if decoder_pretrained_model_name_or_path is None: raise ValueError( "If `decoder_model` is not defined as an argument, a `decoder_pretrained_model_name_or_path` has " "to be defined." ) if "config" not in kwargs_decoder: # TODO: AutoConfig .from_pretrained decoder_config, kwargs_decoder = BartConfig.from_pretrained( decoder_pretrained_model_name_or_path, **kwargs_decoder, return_unused_kwargs=True ) if decoder_config.is_decoder is False or decoder_config.add_cross_attention is False: logger.info( f"Initializing {decoder_pretrained_model_name_or_path} as a decoder model. " f"Cross attention layers are added to {decoder_pretrained_model_name_or_path} " f"and randomly initialized if {decoder_pretrained_model_name_or_path}'s architecture allows for " "cross attention layers." ) decoder_config.is_decoder = True decoder_config.add_cross_attention = True kwargs_decoder["config"] = decoder_config if kwargs_decoder["config"].is_decoder is False or kwargs_decoder["config"].add_cross_attention is False: logger.warning( f"Decoder model {decoder_pretrained_model_name_or_path} is not initialized as a decoder. " f"In order to initialize {decoder_pretrained_model_name_or_path} as a decoder, " "make sure that the attributes `is_decoder` and `add_cross_attention` of `decoder_config` " "passed to `.from_encoder_decoder_pretrained(...)` are set to `True` or do not pass a " "`decoder_config` to `.from_encoder_decoder_pretrained(...)`" ) # TODO: FlaxAutoModelForCausalLM .from_pretrained decoder = FlaxBartForCausalLM.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder) # instantiate config with corresponding kwargs dtype = kwargs.pop("dtype", jnp.float32) config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(encoder.config, decoder.config, **kwargs) # make sure input & output word embeddings are not tied config.tie_word_embeddings = False # init model model = cls(config, dtype=dtype) model.params["encoder"] = encoder.params model.params["decoder"] = decoder.params return model def _beam_search( self, input_ids: None, max_length: Optional[int] = None, pad_token_id: Optional[int] = None, eos_token_id: Optional[int] = None, length_penalty: Optional[float] = None, early_stopping: Optional[bool] = None, logits_processor: Optional[FlaxLogitsProcessorList] = None, trace: bool = True, params: Optional[Dict[str, jnp.ndarray]] = None, model_kwargs: Optional[Dict[str, jnp.ndarray]] = None, ): """ This beam search function is heavily inspired by Flax's official example: https://github.com/google/flax/blob/master/examples/wmt/train.py#L254 """ def flatten_beam_dim(tensor): """Flattens the first two dimensions of a non-scalar array.""" # ignore scalars (e.g. cache index) if tensor.ndim == 0 or tensor.ndim == 1: return tensor elif tensor.ndim == 6: return tensor.reshape(tensor.shape[:1] + (tensor.shape[1] * tensor.shape[2],) + tensor.shape[3:]) return tensor.reshape((tensor.shape[0] * tensor.shape[1],) + tensor.shape[2:]) def unflatten_beam_dim(tensor, batch_size, num_beams): """Unflattens the first, flat batch*beam dimension of a non-scalar array.""" # ignore scalars (e.g. cache index) if tensor.ndim == 0 or tensor.ndim == 1: return tensor if tensor.ndim == 5: return tensor.reshape(tensor.shape[:1] + (batch_size, num_beams) + tensor.shape[2:]) return tensor.reshape((batch_size, num_beams) + tensor.shape[1:]) def gather_beams(nested, beam_indices, batch_size, new_num_beams): """ Gathers the beam slices indexed by beam_indices into new beam array. """ batch_indices = jnp.reshape( jnp.arange(batch_size * new_num_beams) // new_num_beams, (batch_size, new_num_beams) ) def gather_fn(tensor): # ignore scalars (e.g. cache index) if tensor.ndim == 0 or tensor.ndim == 1: return tensor if tensor.ndim == 6: return tensor[:, batch_indices, beam_indices] return tensor[batch_indices, beam_indices] return jax.tree_map(gather_fn, nested) # init values max_length = max_length if max_length is not None else self.config.max_length pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_id = eos_token_id if eos_token_id is not None else self.config.eos_token_id length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty early_stopping = early_stopping if early_stopping is not None else self.config.early_stopping batch_size, num_beams, cur_len = input_ids.shape eos_token_id = jnp.array(eos_token_id) pad_token_id = jnp.array(pad_token_id) cur_len = jnp.array(cur_len) # per batch,beam-item holding current token in loop. sequences = jnp.full((batch_size, num_beams, max_length), pad_token_id, dtype=jnp.int32) running_sequences = jnp.full((batch_size, num_beams, max_length), pad_token_id, dtype=jnp.int32) running_sequences = lax.dynamic_update_slice(sequences, input_ids, (0, 0, 0)) # per batch,beam-item state bit indicating if sentence has finished. is_sent_finished = jnp.zeros((batch_size, num_beams), dtype=jnp.bool_) # per batch,beam-item score, logprobs running_scores = jnp.tile(jnp.array([0.0] + [np.array(-1.0e7)] * (num_beams - 1)), [batch_size, 1]) scores = jnp.ones((batch_size, num_beams)) * np.array(-1.0e7) # For Seq2Seq generation, we only need to use the decoder instead of the whole model in generation loop # and pass it the `encoder_outputs`, which are part of the `model_kwargs`. model = self.decode if self.config.is_encoder_decoder else self # flatten beam dim if "encoder_outputs" in model_kwargs: model_kwargs["encoder_outputs"]["last_hidden_state"] = flatten_beam_dim( model_kwargs["encoder_outputs"]["last_hidden_state"] ) if "attention_mask" in model_kwargs: model_kwargs["attention_mask"] = flatten_beam_dim(model_kwargs["attention_mask"]) # initialize model specific kwargs model_kwargs = self.prepare_inputs_for_generation(flatten_beam_dim(input_ids), max_length, **model_kwargs) # initialize state state = BeamSearchState( cur_len=cur_len, running_sequences=running_sequences, running_scores=running_scores, sequences=sequences, scores=scores, is_sent_finished=is_sent_finished, model_kwargs=model_kwargs, ) def beam_search_cond_fn(state): """beam search state termination condition fn.""" # 1. is less than max length? not_max_length_yet = state.cur_len < max_length # 2. can the new beams still improve? best_running_score = state.running_scores[:, -1:] / (max_length**length_penalty) worst_finished_score = jnp.where( state.is_sent_finished, jnp.min(state.scores, axis=1, keepdims=True), np.array(-1.0e7) ) improvement_still_possible = jnp.all(worst_finished_score < best_running_score) # 3. is there still a beam that has not finished? still_open_beam = ~(jnp.all(state.is_sent_finished) & early_stopping) return not_max_length_yet & still_open_beam & improvement_still_possible def beam_search_body_fn(state, input_ids_length=1): """beam search state update fn.""" # 1. Forward current tokens # Collect the current position slice along length to feed the fast # autoregressive decoder model. Flatten the beam dimension into batch # dimension for feeding into the model. # unflatten beam dimension # Unflatten beam dimension in attention cache arrays input_token = flatten_beam_dim( lax.dynamic_slice( state.running_sequences, (0, 0, state.cur_len - input_ids_length), (batch_size, num_beams, input_ids_length), ) ) model_outputs = model(input_token, params=params, **state.model_kwargs) logits = unflatten_beam_dim(model_outputs.logits[:, -1], batch_size, num_beams) cache = jax.tree_map( lambda tensor: unflatten_beam_dim(tensor, batch_size, num_beams), model_outputs.past_key_values ) # adapt logits for FlaxMarianMTModel logits = self._adapt_logits_for_beam_search(logits) # 2. Compute log probs # get log probabilities from logits, # process logits with processors (*e.g.* min_length, ...), and # add new logprobs to existing running logprobs scores. log_probs = jax.nn.log_softmax(logits) log_probs = logits_processor( flatten_beam_dim(running_sequences), flatten_beam_dim(log_probs), state.cur_len ) log_probs = unflatten_beam_dim(log_probs, batch_size, num_beams) log_probs = log_probs + jnp.expand_dims(state.running_scores, axis=2) vocab_size = log_probs.shape[2] log_probs = log_probs.reshape((batch_size, num_beams * vocab_size)) # 3. Retrieve top-K # Each item in batch has num_beams * vocab_size candidate sequences. # For each item, get the top 2*k candidates with the highest log- # probabilities. We gather the top 2*K beams here so that even if the best # K sequences reach EOS simultaneously, we have another K sequences # remaining to continue the live beam search. # Gather the top 2*K scores from _all_ beams. # Gather 2*k top beams. # Recover the beam index by floor division. # Recover token id by modulo division and expand Id array for broadcasting. # Update sequences for the 2*K top-k new sequences. beams_to_keep = 2 * num_beams topk_log_probs, topk_indices = lax.top_k(log_probs, k=beams_to_keep) topk_beam_indices = topk_indices // vocab_size topk_running_sequences = gather_beams( state.running_sequences, topk_beam_indices, batch_size, beams_to_keep ) topk_ids = jnp.expand_dims(topk_indices % vocab_size, axis=2) topk_sequences = lax.dynamic_update_slice(topk_running_sequences, topk_ids, (0, 0, state.cur_len)) # 4. Check which sequences have ended # Update current sequences: # Did any of these sequences reach an end marker? # To prevent these just finished sequences from being added to the current sequences # set of active beam search sequences, set their log probs to a very large # negative value. did_topk_just_finished = topk_sequences[:, :, state.cur_len] == eos_token_id running_topk_log_probs = topk_log_probs + did_topk_just_finished * np.array(-1.0e7) # 5. Get running sequences scores for next # Determine the top k beam indices (from top 2*k beams) from log probs # and gather top k beams (from top 2*k beams). next_topk_indices = jnp.flip(lax.top_k(running_topk_log_probs, k=num_beams)[1], axis=1) next_running_sequences, next_running_scores = gather_beams( [topk_sequences, running_topk_log_probs], next_topk_indices, batch_size, num_beams ) # 6. Process topk logits # Further process log probs: # - add length penalty # - make sure no scores can be added anymore if beam is full # - make sure still running sequences cannot be chosen as finalized beam topk_log_probs = topk_log_probs / (state.cur_len**length_penalty) beams_in_batch_are_full = ( jnp.broadcast_to(state.is_sent_finished.all(axis=-1, keepdims=True), did_topk_just_finished.shape) & early_stopping ) add_penalty = ~did_topk_just_finished | beams_in_batch_are_full topk_log_probs += add_penalty * np.array(-1.0e7) # 7. Get scores, sequences, is sentence finished for next. # Combine sequences, scores, and flags along the beam dimension and compare # new finished sequence scores to existing finished scores and select the # best from the new set of beams merged_sequences = jnp.concatenate([state.sequences, topk_sequences], axis=1) merged_scores = jnp.concatenate([state.scores, topk_log_probs], axis=1) merged_is_sent_finished = jnp.concatenate([state.is_sent_finished, did_topk_just_finished], axis=1) topk_merged_indices = jnp.flip(lax.top_k(merged_scores, k=num_beams)[1], axis=1) next_sequences, next_scores, next_is_sent_finished = gather_beams( [merged_sequences, merged_scores, merged_is_sent_finished], topk_merged_indices, batch_size, num_beams ) # 8. Update model kwargs. # Determine the top k beam indices from the original set of all beams. # With these, gather the top k beam-associated caches. next_running_indices = gather_beams(topk_beam_indices, next_topk_indices, batch_size, num_beams) next_cache = gather_beams(cache, next_running_indices, batch_size, num_beams) model_outputs["past_key_values"] = jax.tree_map(lambda x: flatten_beam_dim(x), next_cache) next_model_kwargs = self.update_inputs_for_generation(model_outputs, state.model_kwargs) return BeamSearchState( cur_len=state.cur_len + 1, running_scores=next_running_scores, running_sequences=next_running_sequences, scores=next_scores, sequences=next_sequences, is_sent_finished=next_is_sent_finished, model_kwargs=next_model_kwargs, ) # The very first prompt often has sequence length > 1, so run outside of `lax.while_loop` to comply with TPU if input_ids.shape[-1] > 1: state = partial(beam_search_body_fn, input_ids_length=input_ids.shape[-1])(state) if not trace: state = self._run_loop_in_debug(beam_search_cond_fn, beam_search_body_fn, state) else: state = lax.while_loop(beam_search_cond_fn, beam_search_body_fn, state) # Account for the edge-case where there are no finished sequences for a # particular batch item. If so, return running sequences for that batch item. none_finished = jnp.any(state.is_sent_finished, axis=1) sequences = jnp.where(none_finished[:, None, None], state.sequences, state.running_sequences) scores = jnp.where(none_finished[:, None], state.scores, state.running_scores) # return all beams for each batch and the best score sequences = sequences[:, :] scores = scores[:, -1] return FlaxBeamSearchOutput(sequences=sequences, scores=scores)