import copy
import math
import warnings
from typing import List, Optional, Tuple, Union

from transformers import MT5PreTrainedModel
from transformers.models.mt5 import MT5Stack
from transformers.modeling_outputs import Seq2SeqModelOutput, Seq2SeqLMOutput, BaseModelOutput
from transformers.utils import (
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from transformers.utils.model_parallel_utils import assert_device_map, get_device_map

import torch
from torch import nn
from torch.nn import CrossEntropyLoss

from .config import MT5Config
from .docstrings import (
    PARALLELIZE_DOCSTRING,
    DEPARALLELIZE_DOCSTRING,
    __HEAD_MASK_WARNING_MSG,
    MT5_START_DOCSTRING,
    MT5_INPUTS_DOCSTRING,
)

logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "MT5Config"
_CHECKPOINT_FOR_DOC = "mt5-small"

class MT5Model(MT5PreTrainedModel):
    r"""
    Examples:

    ```python
    >>> from transformers import MT5Model, AutoTokenizer

    >>> model = MT5Model.from_pretrained("google/mt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, return_tensors="pt")
    >>> labels = tokenizer(text_target=summary, return_tensors="pt")

    >>> outputs = model(input_ids=inputs["input_ids"], decoder_input_ids=labels["input_ids"])
    >>> hidden_states = outputs.last_hidden_state
    ```"""

    model_type = "mt5"
    config_class = MT5Config
    _keys_to_ignore_on_load_missing = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]

    # Copied from transformers.models.t5.modeling_t5.T5Model.__init__ with T5->MT5
    def __init__(self, config: MT5Config):
        super().__init__(config)
        self.encoder_embedding = nn.Embedding(config.encoder_vocab_size, config.d_model)
        if config.shared_embedding:
            self.decoder_embedding = self.encoder_embedding
        else:
            self.decoder_embedding = nn.Embedding(config.decoder_vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = MT5Stack(encoder_config, self.encoder_embedding)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = MT5Stack(decoder_config, self.decoder_embedding)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    # Copied from transformers.models.t5.modeling_t5.T5Model.parallelize
    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5Model.parallelize` is deprecated and will be removed in v5 of Transformers, you should load your model"
            " with `device_map='balanced'` in the call to `from_pretrained`. You can also provide your own"
            " `device_map` but it needs to be a dictionary module_name to device, so for instance {'encoder.block.0':"
            " 0, 'encoder.block.1': 1, ...}",
            FutureWarning,
        )
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.model_parallel = True

    # Copied from transformers.models.t5.modeling_t5.T5Model.deparallelize
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.decoder = self.decoder.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    # Copied from transformers.models.t5.modeling_t5.T5Model.get_input_embeddings
    def get_input_embeddings(self):
        return self.encoder_embedding

    # Copied from transformers.models.t5.modeling_t5.T5Model.set_input_embeddings
    def set_input_embeddings(self, new_embeddings):
        self.encoder_embedding = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    # Copied from transformers.models.t5.modeling_t5.T5Model.get_encoder
    def get_encoder(self):
        return self.encoder

    # Copied from transformers.models.t5.modeling_t5.T5Model.get_decoder
    def get_decoder(self):
        return self.decoder

    # Copied from transformers.models.t5.modeling_t5.T5Model._prune_heads
    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.encoder.layer[layer].attention.prune_heads(heads)

    # Copied from transformers.models.t5.modeling_t5.T5Model.forward with T5->MT5, t5->mt5
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
r""" | |
Returns: | |
Example: | |
```python | |
>>> from transformers import AutoTokenizer, MT5Model | |
>>> tokenizer = AutoTokenizer.from_pretrained("mt5-small") | |
>>> model = MT5Model.from_pretrained("mt5-small") | |
>>> input_ids = tokenizer( | |
... "Studies have been shown that owning a dog is good for you", return_tensors="pt" | |
... ).input_ids # Batch size 1 | |
>>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 | |
>>> # preprocess: Prepend decoder_input_ids with start token which is pad token for MT5Model. | |
>>> # This is not needed for torch's MT5ForConditionalGeneration as it does this internally using labels arg. | |
>>> decoder_input_ids = model._shift_right(decoder_input_ids) | |
>>> # forward pass | |
>>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids) | |
>>> last_hidden_states = outputs.last_hidden_state | |
```""" | |
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

class MT5ForConditionalGeneration(MT5PreTrainedModel):
    r"""
    Examples:

    ```python
    >>> from transformers import MT5ForConditionalGeneration, AutoTokenizer

    >>> model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")
    >>> tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
    >>> article = "UN Offizier sagt, dass weiter verhandelt werden muss in Syrien."
    >>> summary = "Weiter Verhandlung in Syrien."
    >>> inputs = tokenizer(article, text_target=summary, return_tensors="pt")

    >>> outputs = model(**inputs)
    >>> loss = outputs.loss
    ```"""

    model_type = "mt5"
    config_class = MT5Config
    _keys_to_ignore_on_load_unexpected = ["decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight"]
    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.__init__ with T5->MT5
    def __init__(self, config: MT5Config):
        super().__init__(config)
        self.model_dim = config.d_model

        self.encoder_embedding = nn.Embedding(config.encoder_vocab_size, config.d_model)
        if config.shared_embedding:
            self.decoder_embedding = self.encoder_embedding
        else:
            self.decoder_embedding = nn.Embedding(config.decoder_vocab_size, config.d_model)

        encoder_config = copy.deepcopy(config)
        encoder_config.is_decoder = False
        encoder_config.use_cache = False
        encoder_config.is_encoder_decoder = False
        self.encoder = MT5Stack(encoder_config, self.encoder_embedding)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        decoder_config.num_layers = config.num_decoder_layers
        self.decoder = MT5Stack(decoder_config, self.decoder_embedding)

        self.lm_head = nn.Linear(config.d_model, config.decoder_vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

        # Model parallel
        self.model_parallel = False
        self.device_map = None

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.parallelize
    def parallelize(self, device_map=None):
        warnings.warn(
            "`T5ForConditionalGeneration.parallelize` is deprecated and will be removed in v5 of Transformers, you"
            " should load your model with `device_map='balanced'` in the call to `from_pretrained`. You can also"
            " provide your own `device_map` but it needs to be a dictionary module_name to device, so for instance"
            " {'encoder.block.0': 0, 'encoder.block.1': 1, ...}",
            FutureWarning,
        )
        self.device_map = (
            get_device_map(len(self.encoder.block), range(torch.cuda.device_count()))
            if device_map is None
            else device_map
        )
        assert_device_map(self.device_map, len(self.encoder.block))
        self.encoder.parallelize(self.device_map)
        self.decoder.parallelize(self.device_map)
        self.lm_head = self.lm_head.to(self.decoder.first_device)
        self.model_parallel = True

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.deparallelize
    def deparallelize(self):
        warnings.warn(
            "Like `parallelize`, `deparallelize` is deprecated and will be removed in v5 of Transformers.",
            FutureWarning,
        )
        self.encoder.deparallelize()
        self.decoder.deparallelize()
        self.encoder = self.encoder.to("cpu")
        self.decoder = self.decoder.to("cpu")
        self.lm_head = self.lm_head.to("cpu")
        self.model_parallel = False
        self.device_map = None
        torch.cuda.empty_cache()

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_input_embeddings
    def get_input_embeddings(self):
        return self.encoder_embedding

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_input_embeddings
    def set_input_embeddings(self, new_embeddings):
        self.encoder_embedding = new_embeddings
        self.encoder.set_input_embeddings(new_embeddings)
        self.decoder.set_input_embeddings(new_embeddings)

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.set_output_embeddings
    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_output_embeddings
    def get_output_embeddings(self):
        return self.lm_head

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_encoder
    def get_encoder(self):
        return self.encoder

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.get_decoder
    def get_decoder(self):
        return self.decoder

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.forward with T5->MT5, t5->mt5
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
r""" | |
labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): | |
Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ..., | |
config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for | |
labels in `[0, ..., config.vocab_size]` | |
Returns: | |
Examples: | |
```python | |
>>> from transformers import AutoTokenizer, MT5ForConditionalGeneration | |
>>> tokenizer = AutoTokenizer.from_pretrained("mt5-small") | |
>>> model = MT5ForConditionalGeneration.from_pretrained("mt5-small") | |
>>> # training | |
>>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids | |
>>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids | |
>>> outputs = model(input_ids=input_ids, labels=labels) | |
>>> loss = outputs.loss | |
>>> logits = outputs.logits | |
>>> # inference | |
>>> input_ids = tokenizer( | |
... "summarize: studies have shown that owning a dog is good for you", return_tensors="pt" | |
... ).input_ids # Batch size 1 | |
>>> outputs = model.generate(input_ids) | |
>>> print(tokenizer.decode(outputs[0], skip_special_tokens=True)) | |
>>> # studies have shown that owning a dog is good for you. | |
```""" | |
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # FutureWarning: head_mask was separated into two input args - head_mask, decoder_head_mask
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                warnings.warn(__HEAD_MASK_WARNING_MSG, FutureWarning)
                decoder_head_mask = head_mask

        # Encode if needed (training, first prediction pass)
        if encoder_outputs is None:
            # Convert encoder inputs in embeddings if needed
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        hidden_states = encoder_outputs[0]

        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.decoder.first_device)
            hidden_states = hidden_states.to(self.decoder.first_device)
            if decoder_input_ids is not None:
                decoder_input_ids = decoder_input_ids.to(self.decoder.first_device)
            if attention_mask is not None:
                attention_mask = attention_mask.to(self.decoder.first_device)
            if decoder_attention_mask is not None:
                decoder_attention_mask = decoder_attention_mask.to(self.decoder.first_device)

        # Decode
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        # Set device for model parallelism
        if self.model_parallel:
            torch.cuda.set_device(self.encoder.first_device)
            self.lm_head = self.lm_head.to(self.encoder.first_device)
            sequence_output = sequence_output.to(self.lm_head.weight.device)

        if self.config.tie_word_embeddings:
            # Rescale output before projecting on vocab
            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            # move labels to correct device to enable PP
            labels = labels.to(lm_logits.device)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
            # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666

        if not return_dict:
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )

    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_inputs_for_generation
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        decoder_attention_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs,
    ):
        # cut decoder_input_ids if past_key_values is used
        if past_key_values is not None:
            past_length = past_key_values[0][0].shape[2]

            # Some generation methods already pass only the last input ID
            if input_ids.shape[1] > past_length:
                remove_prefix_length = past_length
            else:
                # Default to old behavior: keep only final ID
                remove_prefix_length = input_ids.shape[1] - 1

            input_ids = input_ids[:, remove_prefix_length:]

        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past_key_values,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "decoder_attention_mask": decoder_attention_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

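    # Illustrative walk-through of the cache-trimming branch above (hypothetical
    # shapes, not taken from the original source): with a cached past_length of 5
    # and decoder input_ids of shape (batch, 6), only the newest column
    # input_ids[:, 5:] is fed to the decoder on the next generation step; if the
    # generation method already passes a single token, the `else` branch keeps
    # just that final ID.
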
    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration.prepare_decoder_input_ids_from_labels
    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        return self._shift_right(labels)

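    # Example of the label-shifting convention (assuming pad_token_id == 0 and
    # decoder_start_token_id == 0, as in the stock mt5-small configuration; the
    # token values below are illustrative only):
    #   labels            = [[ 259, 1234,    1]]
    #   decoder_input_ids = [[   0,  259, 1234]]
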
    # Copied from transformers.models.t5.modeling_t5.T5ForConditionalGeneration._reorder_cache
    def _reorder_cache(self, past_key_values, beam_idx):
        # if decoder past is not included in output
        # speedy decoding is disabled and no need to reorder
        if past_key_values is None:
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past_key_values

        reordered_decoder_past = ()
        for layer_past_states in past_key_values:
            # get the correct batch idx from layer past batch dim
            # batch dim of `past` is at 2nd position
            reordered_layer_past_states = ()
            for layer_past_state in layer_past_states:
                # need to set correct `past` for each of the four key / value states
                reordered_layer_past_states = reordered_layer_past_states + (
                    layer_past_state.index_select(0, beam_idx.to(layer_past_state.device)),
                )

            if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
                raise ValueError(
                    f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
                )
            if len(reordered_layer_past_states) != len(layer_past_states):
                raise ValueError(
                    f"length of reordered_layer_past_states {len(reordered_layer_past_states)} and length of layer_past_states {len(layer_past_states)} mismatched"
                )

            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
        return reordered_decoder_past
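

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the original module. It assumes the
    # local MT5Config (imported from .config above) accepts the standard MT5
    # hyperparameters plus the custom encoder_vocab_size / decoder_vocab_size /
    # shared_embedding fields used by the constructors; the values below are
    # illustrative and build a tiny randomly initialized model.
    config = MT5Config(
        d_model=64,
        d_kv=16,
        d_ff=128,
        num_layers=2,
        num_decoder_layers=2,
        num_heads=4,
        encoder_vocab_size=1000,
        decoder_vocab_size=1000,
        shared_embedding=True,
    )
    model = MT5ForConditionalGeneration(config)

    # Forward pass with random token ids: labels are shifted right internally to
    # build decoder_input_ids, and a cross-entropy loss over the decoder vocab is returned.
    input_ids = torch.randint(0, 1000, (2, 8))
    labels = torch.randint(0, 1000, (2, 6))
    outputs = model(input_ids=input_ids, labels=labels)
    print(outputs.loss.item(), outputs.logits.shape)  # logits: (2, 6, 1000)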