|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
""" PyTorch Manta model.""" |
|
|
|
|
|
import math |
|
from dataclasses import dataclass |
|
import warnings |
|
from typing import Optional, Tuple, Union |
|
|
|
import torch |
|
from torch import nn |
|
from torch.nn import CrossEntropyLoss |
|
|
|
from transformers.modeling_outputs import BaseModelOutput, Seq2SeqLMOutput, Seq2SeqModelOutput |
|
from transformers.modeling_utils import PreTrainedModel |
|
from transformers.models.longformer import LongformerConfig, LongformerModel |
|
from transformers.models.t5.configuration_t5 import T5Config |
|
from transformers.models.t5.modeling_t5 import ( |
|
__HEAD_MASK_WARNING_MSG, |
|
T5Attention, |
|
T5Stack, |
|
) |
|
from transformers.utils import ( |
|
DUMMY_INPUTS, |
|
DUMMY_MASK, |
|
add_start_docstrings, |
|
add_end_docstrings, |
|
is_torch_fx_proxy, |
|
logging, |
|
replace_return_docstrings, |
|
) |
|
from configuration_manta import MantaConfig |
|
|
|
|
|
# Module-level logger created through the `transformers` logging utilities imported above.
logger = logging.get_logger(__name__)

# Config class name handed to `replace_return_docstrings` on the `forward` methods below.
_CONFIG_FOR_DOC = "MantaConfig"
# Tokenizer class name kept for documentation purposes; not referenced in the visible part of this file.
_TOKENIZER_FOR_DOC = "ByT5Tokenizer"

# Pretrained model archive list; currently empty.
MANTA_PRETRAINED_MODEL_ARCHIVE_LIST = []
|
|
|
|
|
def gaussian_pdf(x):
    """
    Evaluate the Gaussian kernel `exp(-x ** 2 / 2)` element-wise on the tensor `x`.

    The result is not scaled by `1 / sqrt(2 * pi)`, so the value at `x = 0` is exactly 1.
    """
    squared = x * x
    return torch.exp(-squared / 2.0)
|
|
|
|
|
def pad_block_embeddings(block_embeddings, pad_length):
    """
    Force the second dimension of `block_embeddings` to be exactly `pad_length` long.

    Longer inputs are truncated and shorter ones are right-padded with zeros of the same dtype and device. A falsy
    `pad_length` (`None` or `0`) disables the operation and the input tensor itself is returned.

    Args:
        block_embeddings: 3-D tensor; the code reads its sizes along dimensions 0, 1 and 2.
        pad_length: target length along dimension 1, or a falsy value to leave the tensor untouched.
    """
    if pad_length:
        n_missing = max(pad_length - block_embeddings.size(1), 0)
        filler = block_embeddings.new_zeros((block_embeddings.size(0), n_missing, block_embeddings.size(2)))
        kept = block_embeddings[:, :pad_length, :]
        return torch.cat([kept, filler], dim=1)

    return block_embeddings
|
|
|
|
|
@add_end_docstrings()
@dataclass
class MantaSeq2SeqLMOutput(Seq2SeqLMOutput):
    """
    Sequence-to-sequence language modeling output that additionally carries Manta's frontier predictions. All fields
    except `frontier_predictions` are inherited from `Seq2SeqLMOutput`.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
            weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        frontier_predictions (`torch.FloatTensor` of shape `(batch_size, input_sequence_length)`, *optional*):
            Probability scores of being a frontier as predicted by the `MantaFrontierPredictor` module, one score per
            input position (the predictor squeezes the trailing dimension of its projection).
    """

    # Extra field added on top of the inherited `Seq2SeqLMOutput` fields.
    frontier_predictions: Optional[torch.FloatTensor] = None
|
|
|
|
|
@dataclass
class MantaBaseModelOutput(BaseModelOutput):
    """
    Base class for Manta's outputs, with potential hidden states, attentions and Manta's frontier predictions. All
    fields except `frontier_predictions` are inherited from `BaseModelOutput`.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        frontier_predictions (`torch.FloatTensor` of shape `(batch_size, input_sequence_length)`, *optional*):
            Probability scores of being a frontier as predicted by the `MantaFrontierPredictor` module, one score per
            input position (the predictor squeezes the trailing dimension of its projection).
    """

    # Extra field added on top of the inherited `BaseModelOutput` fields.
    frontier_predictions: Optional[torch.FloatTensor] = None
|
|
|
|
|
class MantaFrontierPredictor(nn.Module):
    """
    Scores every input position with a probability of being a block frontier.

    The embeddings go through a `LongformerModel`, a linear layer maps each hidden state to a single logit, and a
    sigmoid turns the logits into probabilities.
    """

    def __init__(
        self,
        hidden_size,
        num_layers,
        num_attention_heads,
        dropout_rate,
        attention_window,
        max_length,
    ):
        super().__init__()

        self.hidden_size = hidden_size

        # Number of attention windows needed to go past `max_length`, converted back to positions, plus one.
        # NOTE(review): the extra window / slot presumably leaves room for Longformer's own padding and position
        # offset - confirm against the Longformer implementation.
        n_windows = max_length // attention_window + 1
        longformer_kwargs = {
            "attention_probs_dropout_prob": dropout_rate,
            "attention_window": attention_window,
            "hidden_act": "gelu",
            "hidden_dropout_prob": dropout_rate,
            "hidden_size": hidden_size,
            "intermediate_size": hidden_size * 4,
            "max_position_embeddings": n_windows * attention_window + 1,
            "num_attention_heads": num_attention_heads,
            "num_hidden_layers": num_layers,
            "position_embedding_type": "absolute",
            "vocab_size": 1,
            "pad_token_id": 0,
        }
        self.config = LongformerConfig(**longformer_kwargs)
        self.local_transformer = LongformerModel(self.config)

        self.output_projection = nn.Linear(hidden_size, 1)

    def forward(self, embeddings, attention_mask):
        """
        Return frontier probabilities for `embeddings`.

        `embeddings` is forwarded as `inputs_embeds` and `attention_mask` as `attention_mask` to the Longformer. The
        trailing size-1 dimension of the projection is squeezed before the sigmoid, so the result has one dimension
        fewer than the Longformer hidden states.
        """
        hidden = self.local_transformer(inputs_embeds=embeddings, attention_mask=attention_mask).last_hidden_state
        frontier_logits = self.output_projection(hidden).squeeze(-1)
        return torch.sigmoid(frontier_logits)
|
|
|
|
|
class MantaConvFeatures(nn.Module):
    """
    "Decomposed" 1-D convolution that returns the per-kernel-position feature maps instead of their sum.

    Two weight layouts are supported:

    - `groups == in_channels` (depthwise, requires `in_channels == out_channels`): weight of shape
      `(1, 1, kernel_size, out_channels)`, applied element-wise.
    - `groups == 1` (dense): weight of shape `(in_channels, out_channels, kernel_size)`, applied with a matmul.

    Both paths map an input of shape `(batch, in_channels, seq_len)` to a tensor of shape
    `(batch, out_channels, kernel_size, padded_seq_len)`, where `padded_seq_len = seq_len + kernel_size - 1` when
    `padding == "same"` and `seq_len` otherwise.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        kernel_size,
        groups,
        padding,
    ):
        """
        This nn.Module "decomposes" the convolution in order to extract and cache feature maps. This amounts to
        computing an element-wise multiplication between weights of size (hidden_dim, kernel_size) and the input.

        Args:
            in_channels: number of input channels.
            out_channels: number of output channels.
            kernel_size: width of the kernel.
            groups: either `1` (dense) or `in_channels` (depthwise).
            padding: `"same"` to zero-pad the input before computing the features; any other value disables padding.

        Raises:
            AssertionError: if `groups == in_channels` but `in_channels != out_channels`.
            ValueError: if `groups` is neither `1` nor `in_channels`.
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.groups = groups
        self.padding = padding

        if groups == in_channels:
            assert (
                in_channels == out_channels
            ), "When using `groups = in_channels`, make sure to have `in_channels == out_channels`"
            self.weight = nn.Parameter(torch.empty(1, 1, kernel_size, out_channels))
        elif self.groups == 1:
            self.weight = nn.Parameter(torch.empty(in_channels, out_channels, kernel_size))
        else:
            raise ValueError("MantaConvFeatures only supports `groups = 1` or `groups = in_channels`")

        # Split the `kernel_size - 1` padding positions between left and right; the right side gets the extra one
        # when the kernel size is even.
        left_pad = (kernel_size - 1) // 2
        self.pad = (left_pad, kernel_size - 1 - left_pad)

        self.reset_parameters()

    def reset_parameters(self):
        """
        See https://pytorch.org/docs/stable/_modules/torch/nn/modules/conv.html#Conv1d, in the `_ConvNd` class :
        > Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
        > uniform(-1/sqrt(k), 1/sqrt(k)), where k = weight.size(1) * prod(*kernel_size)
        > For more details see: https://github.com/pytorch/pytorch/issues/15314#issuecomment-477448573"

        The reason we permute the weights before init is because `kaiming_uniform_` uses the number of in and out
        features for initialization, which are computed as tensor.size(0) and tensor.size(1). However, these
        dimensions do not correspond for the depthwise weights.
        """
        # Decide from the weight rank rather than from `groups`: `__init__` selects the layout with
        # `groups == in_channels`, so comparing `groups` with `out_channels` here picked the wrong branch (and
        # crashed in `permute`) for a dense layer with a single output channel.
        if self.weight.dim() == 4:
            nn.init.kaiming_uniform_(self.weight.permute(3, 0, 1, 2), a=math.sqrt(5))
        else:
            nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, x: torch.Tensor):
        """Dispatch to the matmul path when `groups == 1` and to the element-wise path otherwise."""
        if self.groups == 1:
            return self.forward_matmul(x)
        return self.forward_elementwise(x)

    def forward_matmul(self, x: torch.Tensor):
        """
        Dense path: `x` of shape `(batch, in_channels, seq_len)` is multiplied with the flattened weight.

        Returns a tensor of shape `(batch, out_channels, kernel_size, padded_seq_len)`.
        """
        padded_x = self._pad_pre_conv(x) if self.padding == "same" else x

        bs, _, seq_len = padded_x.size()

        # (batch, channels, seq) -> (batch, seq, channels) so the channel dimension is contracted by the matmul.
        padded_x = padded_x.transpose(-1, -2)

        # The weight is flattened to (in_channels, out_channels * kernel_size); the product is then unflattened.
        flat_weight = self.weight.view(self.weight.size(0), -1)
        out = padded_x.matmul(flat_weight).view(bs, seq_len, self.out_channels, -1)

        return out.permute(0, 2, 3, 1)

    def forward_elementwise(self, x: torch.Tensor):
        """
        Depthwise path: every channel of `x` is multiplied by its own kernel weights.

        `x` must be 3-D with `x.size(1) == out_channels`. Returns a tensor of shape
        `(batch, out_channels, kernel_size, padded_seq_len)`.
        """
        assert len(x.size()) == 3
        assert x.size(1) == self.out_channels

        padded_x = self._pad_pre_conv(x) if self.padding == "same" else x

        # (batch, channels, seq) -> (batch, seq, 1, channels), broadcast against (1, 1, kernel_size, channels).
        padded_x = padded_x.transpose(-1, -2).unsqueeze(2)

        out = padded_x * self.weight

        return out.transpose(1, 3)

    def _pad_pre_conv(self, inp: torch.Tensor):
        """
        Pad with zeros at the beginning and end just like `nn.Conv1d`.
        """
        return nn.functional.pad(inp, self.pad, "constant", 0.0)

    def extra_repr(self):
        return "in_features={}, out_features={}, kernel_size={}, groups={}".format(
            self.in_channels, self.out_channels, self.kernel_size, self.groups
        )
|
|
|
|
|
class MantaCachedConvolutionPooling(nn.Module):
    """
    Pools byte-level embeddings into block-level embeddings using soft block assignments.

    Frontier probabilities are turned into a soft assignment of every byte to a block id; per-kernel-position
    convolution features (`MantaConvFeatures`) are then weighted by these assignments, reduced per block, optionally
    padded / truncated to a fixed number of blocks and projected to `output_dim`.
    """

    def __init__(
        self,
        padding_length,
        output_dim,
        kernel_size,
        hidden_dim,
        depthwise_convolution,
        variance_regularization,
        mean_pool,
    ):
        """
        Args:
            padding_length: upper bound on the number of output blocks; a falsy value disables padding/truncation.
            output_dim: size of the projected block embeddings.
            kernel_size: either an int (one convolution with `hidden_dim` output channels) or a list of
                `(kernel_size, out_channels)` pairs, one per convolution.
            hidden_dim: size of the incoming byte embeddings.
            depthwise_convolution: when truthy each convolution uses `groups = out_channels`, otherwise `groups = 1`.
            variance_regularization: constant added to the block-id variance before taking the square root.
            mean_pool: when truthy blocks are mean-pooled, otherwise max-pooled over the sequence.
        """
        super().__init__()
        self.padding_length = padding_length
        self.output_dim = output_dim
        self.kernel_size = kernel_size
        self.hidden_dim = hidden_dim
        self.depthwise_convolution = depthwise_convolution
        self.variance_regularization = variance_regularization
        self.mean_pool = mean_pool

        if isinstance(self.kernel_size, int):
            self.kernel_size = [[self.kernel_size, hidden_dim]]

        # The outputs of all convolutions are concatenated along the feature dimension.
        self.conv_output_dim = sum(k_dim[1] for k_dim in self.kernel_size)

        self.out_projection = nn.Linear(self.conv_output_dim, self.output_dim, bias=True)

        self.conv_layers = nn.Sequential(
            *[
                MantaConvFeatures(self.hidden_dim, h, k, groups=h if self.depthwise_convolution else 1, padding="same")
                for (k, h) in self.kernel_size
            ]
        )

        # `eps` is resolved lazily on the first forward pass, once the dtype of the inputs is known.
        self.eps = None
        # Never assigned a module inside this class; `forward` only moves it to the right device when it is set.
        self.conv_layer = None

    def forward(self, unconstrained_separation_probs: torch.Tensor, byte_embeddings: torch.Tensor):
        """
        Args:
            unconstrained_separation_probs: frontier probabilities of shape `(batch_size, seq_len)`.
            byte_embeddings: tensor whose first two dimensions are `(batch_size, seq_len)`.

        Returns:
            Block embeddings of shape `(batch_size, n_blocks, output_dim)`.
        """
        device = unconstrained_separation_probs.device
        if self.eps is None:
            self.eps = 5 * torch.finfo(unconstrained_separation_probs.dtype).resolution
            self.variance_regularization = max(self.eps, self.variance_regularization)

        if self.conv_layer is not None:
            self.conv_layer = self.conv_layer.to(device)
        batch_size, seq_len = byte_embeddings.shape[:2]

        # The first position always belongs to block 0, so its frontier probability is forced to zero.
        separation_probs = unconstrained_separation_probs.clone()
        separation_probs[:, 0] = 0

        assert separation_probs.shape == (batch_size, seq_len)

        # Running sums of the per-position frontier probabilities p and of p * (1 - p): expected block id of each
        # position and its (regularized) standard deviation.
        block_id_expectation = separation_probs.cumsum(dim=-1)
        block_id_std = torch.sqrt(
            (separation_probs * (1.0 - separation_probs)).cumsum(dim=-1) + self.variance_regularization
        )

        # Only block ids up to three standard deviations above the largest expectation are considered.
        max_nb_blocks = min(seq_len, (block_id_expectation + 3 * block_id_std).max().int().item() + 1)
        possible_blocks_id = torch.arange(max_nb_blocks, device=device)

        # NOTE(review): the squared distance is divided by `2 * std` rather than `2 * std ** 2`; kept unchanged
        # because altering it would change the outputs of already-trained weights - confirm the intent.
        # The last two terms do not depend on the block id, so they cancel in the softmax over blocks below.
        log_scale = block_id_std[:, None, :].log()
        log_proba = (
            -((block_id_expectation[:, None, :] - possible_blocks_id[None, :, None]) ** 2)
            / (2 * block_id_std[:, None, :])
            - log_scale
            - math.log((2 * math.pi) ** 0.5)
        )
        # (batch_size, max_nb_blocks, seq_len), normalized over the block dimension.
        block_byte_proba = log_proba.softmax(-2)

        token_size = block_byte_proba.sum(-1, keepdim=True)
        regularized_token_size = torch.maximum(token_size, torch.ones_like(token_size))

        if self.mean_pool:
            block_byte_proba_normalized = block_byte_proba / regularized_token_size
        else:
            block_byte_proba_normalized = block_byte_proba

        block_embeddings = self.pooling(byte_embeddings, block_byte_proba_normalized)

        # `pad_block_embeddings` treats a falsy length as "leave untouched"; guard the `min` so that a
        # `padding_length` of `None` follows that path instead of raising a `TypeError`.
        pad_length = min(self.padding_length, max_nb_blocks) if self.padding_length else None

        block_embeddings = pad_block_embeddings(block_embeddings, pad_length)
        block_embeddings = self.out_projection(block_embeddings)

        return block_embeddings

    def pooling(self, embeddings: torch.Tensor, block_byte_proba: torch.Tensor):
        """
        Reduce `embeddings` into one vector per block for every convolution and concatenate the results.

        Args:
            embeddings: tensor of shape `(batch_size, seq_len, hidden_dim)`.
            block_byte_proba: tensor of shape `(batch_size, n_blocks, seq_len)`.

        Returns:
            Tensor of shape `(batch_size, n_blocks, conv_output_dim)`.
        """
        block_embeddings = []

        for conv_layer in self.conv_layers:
            # (batch, channels, kernel, padded_seq) -> (batch, padded_seq, channels, kernel)
            features = conv_layer(embeddings.transpose(1, 2)).permute(0, 3, 1, 2)

            pad = conv_layer.pad

            # Shift the feature map of every kernel position along the sequence axis so that the contributions
            # belonging to the same output position line up.
            for i in range(0, conv_layer.kernel_size):
                features[..., i] = features[..., i].roll(pad[0] - i, 1)

            # Drop the positions introduced by the "same" padding.
            features = features[:, pad[1] : features.size(1) - pad[0]]

            # Apply the same pad / roll / crop to the probabilities so each feature is weighted by the probability
            # of the byte it was computed from.
            padded_block_byte_proba = nn.functional.pad(block_byte_proba, pad, "constant", 0.0)
            expanded_block_byte_proba = torch.stack(
                [padded_block_byte_proba.roll(pad[0] - i, -1) for i in range(0, conv_layer.kernel_size)], -1
            )

            expanded_block_byte_proba = expanded_block_byte_proba[
                :, :, pad[1] : expanded_block_byte_proba.size(2) - pad[0], :
            ]

            if self.mean_pool:
                # Sum over sequence positions and kernel positions at once.
                convolved = torch.einsum("b s h k, b B s k -> b B h", features, expanded_block_byte_proba)
            else:
                # Sum over kernel positions only, then take the maximum over the sequence.
                convolved = torch.einsum("b s h k, b B s k -> b B s h", features, expanded_block_byte_proba)
                convolved = convolved.max(dim=-2).values

            block_embeddings.append(convolved)

        block_embeddings = torch.cat(block_embeddings, dim=-1)

        return block_embeddings
|
|
|
|
|
class MantaPreTrainedModel(PreTrainedModel):
    """
    Common base of the Manta models: declares the configuration class and provides the weight-initialization hook,
    the gradient-checkpointing switch and the `_shift_right` helper.
    """

    config_class = MantaConfig
    base_model_prefix = "transformer"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Weight-initialization hook; currently does nothing."""
        return None

    def _set_gradient_checkpointing(self, module, value=False):
        """Set `gradient_checkpointing` to `value` on `T5Attention` and `T5Stack` modules; ignore others."""
        if not isinstance(module, (T5Attention, T5Stack)):
            return
        module.gradient_checkpointing = value

    def _shift_right(self, input_ids):
        """
        Shift `input_ids` one position to the right along the last dimension.

        The first position is filled with `config.decoder_start_token_id`, the last input token is dropped and every
        `-100` in the result is replaced by `config.pad_token_id`.

        Raises:
            AssertionError: if `decoder_start_token_id` or `pad_token_id` is not set on the config.
        """
        start_id = self.config.decoder_start_token_id
        pad_id = self.config.pad_token_id

        assert start_id is not None, (
            "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id."
            " See T5 docs for more information"
        )

        if not is_torch_fx_proxy(input_ids):
            shifted = input_ids.new_zeros(input_ids.shape)
            shifted[..., 1:] = input_ids[..., :-1].clone()
            shifted[..., 0] = start_id
        else:
            # For fx proxies the shifted tensor is built by concatenation instead of slice assignment.
            leading_column = torch.full(input_ids.shape[:-1] + (1,), start_id)
            shifted = torch.cat([leading_column, input_ids[..., :-1]], dim=-1)

        assert pad_id is not None, "self.model.config.pad_token_id has to be defined."

        # -100 marks ignored label positions; it must not be fed to the model as a token id.
        shifted.masked_fill_(shifted == -100, pad_id)

        return shifted
|
|
|
|
|
@add_start_docstrings(
    "The bare Manta Model transformer outputting encoder's raw hidden-states without any specific head on top."
)
class MantaEncoderModel(MantaPreTrainedModel):
    # Pipeline: byte embeddings -> frontier predictor -> block pooling -> T5 encoder stack.
    authorized_missing_keys = [
        r"encoder.embed_tokens.weight",
    ]
    # Expose the same patterns under the attribute name used by the sibling model classes of this file.
    _keys_to_ignore_on_load_missing = authorized_missing_keys

    def __init__(self, config: MantaConfig):
        super().__init__(config)
        self.byte_embeddings = nn.Embedding(config.vocab_size, config.byte_embedding_dim)

        self.frontier_predictor = MantaFrontierPredictor(
            hidden_size=config.byte_embedding_dim,
            num_layers=config.frontier_predictor_num_layers,
            num_attention_heads=config.frontier_predictor_num_attention_heads,
            dropout_rate=config.dropout_rate,
            attention_window=config.frontier_predictor_attention_window,
            max_length=config.max_length_inputs,
        )

        self.pooler = MantaCachedConvolutionPooling(
            padding_length=config.max_length_encoder_decoder,
            output_dim=config.d_model,
            kernel_size=config.pooling_kernel_size,
            hidden_dim=config.byte_embedding_dim,
            depthwise_convolution=config.pooling_depthwise_convolution,
            variance_regularization=config.pooling_variance_regularization,
            mean_pool=config.pooling_mean_pool,
        )

        # No embedding matrix is given to the stack: it is always fed the pooled representations as
        # `inputs_embeds`.
        self.t5_encoder = T5Stack(
            T5Config(
                d_model=config.d_model,
                d_kv=config.d_kv,
                d_ff=config.d_ff,
                num_layers=config.num_layers,
                num_heads=config.num_heads,
                relative_attention_num_buckets=config.relative_attention_num_buckets,
                relative_attention_max_distance=config.relative_attention_max_distance,
                dropout_rate=config.dropout_rate,
                layer_norm_epsilon=config.layer_norm_epsilon,
                initializer_factor=config.initializer_factor,
                feed_forward_proj=config.feed_forward_proj,
                pad_token_id=config.pad_token_id,
                eos_token_id=config.eos_token_id,
                is_decoder=False,
                use_cache=False,
            )
        )

        self.post_init()

    def get_input_embeddings(self):
        return self.byte_embeddings

    def set_input_embeddings(self, new_embeddings):
        self.byte_embeddings = new_embeddings

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        for layer, heads in heads_to_prune.items():
            self.t5_encoder.block[layer].layer[0].SelfAttention.prune_heads(heads)

    def _compute_pooled_representations(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ):
        """
        Embed the inputs, predict frontiers and pool the byte embeddings into blocks.

        `inputs_embeds` takes precedence over `input_ids` when both are given.

        Returns:
            `(pooled_representations, frontier_predictions)`, or `None` when neither `input_ids` nor
            `inputs_embeds` is provided.
        """
        if inputs_embeds is None and input_ids is None:
            return None

        byte_embeddings = inputs_embeds if inputs_embeds is not None else self.byte_embeddings(input_ids)

        frontier_predictions = self.frontier_predictor(byte_embeddings, attention_mask)

        pooled_representations = self.pooler(frontier_predictions, byte_embeddings)

        return pooled_representations, frontier_predictions

    @replace_return_docstrings(output_type=MantaBaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], MantaBaseModelOutput]:
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import ByT5Tokenizer, MantaEncoderModel

        >>> tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
        >>> model = MantaEncoderModel.from_pretrained("nthngdy/manta-small")
        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Fail with an explicit message instead of an opaque tuple-unpacking `TypeError` on the `None` returned by
        # `_compute_pooled_representations` when no input is given.
        if input_ids is None and inputs_embeds is None:
            raise ValueError("You have to specify either input_ids or inputs_embeds")

        pooled_representations, frontier_predictions = self._compute_pooled_representations(
            input_ids, attention_mask, inputs_embeds
        )

        # `attention_mask` is only consumed by the frontier predictor above; it is not passed to the T5 stack,
        # which runs on the pooled blocks.
        encoder_outputs = self.t5_encoder(
            inputs_embeds=pooled_representations,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            # Tuple output: the frontier predictions are appended after the T5 stack outputs.
            return encoder_outputs + (frontier_predictions,)

        return MantaBaseModelOutput(frontier_predictions=frontier_predictions, **encoder_outputs)
|
|
|
|
|
@dataclass
class MantaSeq2SeqModelOutput(Seq2SeqModelOutput):
    """
    Output of the bare Manta encoder-decoder model: the fields of `Seq2SeqModelOutput` plus Manta's frontier
    predictions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the decoder of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute
            the weighted average in the cross-attention heads.
        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in
            the self-attention heads.
        frontier_predictions (`torch.FloatTensor` of shape `(batch_size, input_sequence_length)`, *optional*):
            Probability scores of being a frontier as predicted by the `MantaFrontierPredictor` module.
    """

    frontier_predictions: Optional[torch.FloatTensor] = None


class MantaModel(MantaPreTrainedModel):
    """
    Bare Manta encoder-decoder: a `MantaEncoderModel` followed by a T5 decoder stack with its own token embeddings,
    without any head on top.
    """

    _keys_to_ignore_on_load_missing = [
        r"encoder_decoder.encoder.embed_tokens.weight",
        r"encoder_decoder.decoder.embed_tokens.weight",
    ]
    _keys_to_ignore_on_load_unexpected = [
        r"encoder_decoder.decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]

    def __init__(self, config: MantaConfig):
        super().__init__(config)

        self.encoder = MantaEncoderModel(config)

        # The decoder consumes token ids directly, so it gets a regular embedding matrix of width `d_model`.
        self.decoder_embeddings = nn.Embedding(config.vocab_size, config.d_model)
        self.decoder = T5Stack(
            T5Config(
                vocab_size=config.vocab_size,
                d_model=config.d_model,
                d_kv=config.d_kv,
                d_ff=config.d_ff,
                num_layers=config.num_decoder_layers,
                num_heads=config.num_heads,
                relative_attention_num_buckets=config.relative_attention_num_buckets,
                relative_attention_max_distance=config.relative_attention_max_distance,
                dropout_rate=config.dropout_rate,
                layer_norm_epsilon=config.layer_norm_epsilon,
                initializer_factor=config.initializer_factor,
                feed_forward_proj=config.feed_forward_proj,
                use_cache=config.use_cache,
                pad_token_id=config.pad_token_id,
                eos_token_id=config.eos_token_id,
                is_decoder=True,
                is_encoder_decoder=False,
            ),
            self.decoder_embeddings,
        )

        self.post_init()

    def get_input_embeddings(self):
        return self.encoder.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        self.encoder.set_input_embeddings(new_embeddings)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def _prune_heads(self, heads_to_prune):
        """
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        """
        # `MantaEncoderModel` has no `layer` attribute; its own `_prune_heads` knows where the self-attention
        # modules of the T5 encoder stack live, so delegate to it.
        self.encoder._prune_heads(heads_to_prune)

    @replace_return_docstrings(output_type=MantaSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        decoder_inputs_embeds: Optional[torch.Tensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], MantaSeq2SeqModelOutput]:
        r"""
        Returns:

        Example:

        ```python
        >>> from transformers import ByT5Tokenizer, MantaModel

        >>> tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
        >>> model = MantaModel.from_pretrained("nthngdy/manta-small")

        >>> input_ids = tokenizer(
        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # preprocess: Prepend decoder_input_ids with start token which is pad token for MantaModel.
        >>> # This is not needed for torch's MantaForConditionalGeneration as it does this internally using labels arg.
        >>> decoder_input_ids = model._shift_right(decoder_input_ids)

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```"""
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, MantaBaseModelOutput):
            # Wrap user-supplied tuples so the attribute accesses below work.
            encoder_outputs = MantaBaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
                frontier_predictions=encoder_outputs[3] if len(encoder_outputs) > 3 else None,
            )

        hidden_states = encoder_outputs[0]

        # NOTE(review): `attention_mask` is defined over the byte-level inputs, whereas `hidden_states` has one
        # position per pooled block; confirm these lengths agree before relying on the cross-attention mask.
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=hidden_states,
            encoder_attention_mask=attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        # `Seq2SeqLMOutput`-based classes have no `last_hidden_state` field, so the bare model returns a
        # `Seq2SeqModelOutput`-based class instead.
        return MantaSeq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
            frontier_predictions=encoder_outputs.frontier_predictions,
        )
|
|
|
|
|
@add_start_docstrings("""Manta Model with a `language modeling` head on top.""") |
|
class MantaForConditionalGeneration(MantaPreTrainedModel):
    """Manta encoder combined with a T5 decoder stack and a language-modeling head.

    The encoder is a `MantaEncoderModel` built from the Manta configuration. The decoder is a `T5Stack` built from a
    `T5Config` derived from that same configuration, using its own embedding table (`decoder_embeddings`). `lm_head`
    is a bias-free linear projection from `d_model` to `vocab_size`.
    """

    # Checkpoint keys that may legitimately be absent / superfluous when loading weights.
    _keys_to_ignore_on_load_missing = [
        r"encoder.embed_tokens.weight",
        r"decoder.embed_tokens.weight",
        r"lm_head.weight",
    ]
    _keys_to_ignore_on_load_unexpected = [
        r"decoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weight",
    ]

    def __init__(self, config: MantaConfig):
        """Build the encoder, the decoder stack with its embeddings, and the LM head.

        Args:
            config: Manta configuration; the decoder's `T5Config` is filled from its fields.
        """
        super().__init__(config)
        self.model_dim = config.d_model

        self.encoder = MantaEncoderModel(config)

        # The decoder owns a separate embedding table; it is not shared with the encoder here.
        self.decoder_embeddings = nn.Embedding(config.vocab_size, config.d_model)
        decoder_config = T5Config(
            vocab_size=config.vocab_size,
            d_model=config.d_model,
            d_kv=config.d_kv,
            d_ff=config.d_ff,
            num_layers=config.num_decoder_layers,
            num_heads=config.num_heads,
            relative_attention_num_buckets=config.relative_attention_num_buckets,
            relative_attention_max_distance=config.relative_attention_max_distance,
            dropout_rate=config.dropout_rate,
            layer_norm_epsilon=config.layer_norm_epsilon,
            initializer_factor=config.initializer_factor,
            feed_forward_proj=config.feed_forward_proj,
            use_cache=config.use_cache,
            pad_token_id=config.pad_token_id,
            eos_token_id=config.eos_token_id,
            is_decoder=True,
            is_encoder_decoder=False,
        )
        self.decoder = T5Stack(decoder_config, self.decoder_embeddings)

        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Delegates weight initialization / final processing to the base class.
        self.post_init()

    def get_input_embeddings(self):
        """Return the input embeddings exposed by the encoder."""
        return self.encoder.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        """Forward `new_embeddings` to the encoder's `set_input_embeddings`."""
        self.encoder.set_input_embeddings(new_embeddings)

    def set_output_embeddings(self, new_embeddings):
        """Replace the language-modeling head with `new_embeddings`."""
        self.lm_head = new_embeddings

    def get_output_embeddings(self):
        """Return the language-modeling head."""
        return self.lm_head

    def get_encoder(self):
        """Return the encoder module."""
        return self.encoder

    def get_decoder(self):
        """Return the decoder stack."""
        return self.decoder

    @replace_return_docstrings(output_type=MantaSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.BoolTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], MantaSeq2SeqLMOutput]:
        r"""
        Run the encoder (unless `encoder_outputs` is supplied), then the decoder and the language-modeling head, and
        optionally compute the cross-entropy loss against `labels`.

        Args:
            encoder_outputs (*optional*):
                Pre-computed encoder outputs. When given, the encoder is not run; its first element is used as the
                encoder hidden states fed to the decoder.
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the language modeling loss. Indices should be in `[-100, 0, ...,
                config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
                labels in `[0, ..., config.vocab_size - 1]`. When neither `decoder_input_ids` nor
                `decoder_inputs_embeds` is given, the decoder inputs are obtained with `self._shift_right(labels)`.

        Returns:

        Examples:

        ```python
        >>> from transformers import ByT5Tokenizer, MantaForConditionalGeneration

        >>> tokenizer = ByT5Tokenizer.from_pretrained("google/byt5-small")
        >>> model = MantaForConditionalGeneration.from_pretrained("nthngdy/manta-small")

        >>> # training
        >>> input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
        >>> labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
        >>> outputs = model(input_ids=input_ids, labels=labels)
        >>> loss = outputs.loss
        >>> logits = outputs.logits

        >>> # inference
        >>> input_ids = tokenizer(
        ...     "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        >>> # studies have shown that owning a dog is good for you.
        ```"""
        # Fall back to the configuration for every flag the caller left unset.
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Only `head_mask` was given: reuse it for the decoder when both stacks have the same depth, and warn.
        if head_mask is not None and decoder_head_mask is None:
            if self.config.num_layers == self.config.num_decoder_layers:
                # The message must be looked up by string: a bare `__HEAD_MASK_WARNING_MSG` written inside a class
                # body is name-mangled to `_MantaForConditionalGeneration__HEAD_MASK_WARNING_MSG` -> NameError.
                warnings.warn(globals()["__HEAD_MASK_WARNING_MSG"], FutureWarning)
                decoder_head_mask = head_mask

        if encoder_outputs is None:
            # No cached encoder pass available: encode the inputs now.
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        elif return_dict and not isinstance(encoder_outputs, MantaBaseModelOutput):
            # Wrap caller-supplied tuples so the attribute accesses in the return statement work. This has to be
            # `MantaBaseModelOutput`: plain `BaseModelOutput` does not accept a `frontier_predictions` field.
            encoder_outputs = MantaBaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
                frontier_predictions=encoder_outputs[3] if len(encoder_outputs) > 3 else None,
            )

        hidden_states = encoder_outputs[0]

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # Derive the decoder inputs from the labels (helper presumably inherited from the base class).
            decoder_input_ids = self._shift_right(labels)

        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            inputs_embeds=decoder_inputs_embeds,
            past_key_values=past_key_values,
            encoder_hidden_states=hidden_states,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = decoder_outputs[0]

        if self.config.tie_word_embeddings:
            # Rescale the decoder output by d_model ** -0.5 before projecting onto the vocabulary.
            sequence_output = sequence_output * (self.model_dim**-0.5)

        lm_logits = self.lm_head(sequence_output)

        loss = None
        if labels is not None:
            # Token-level cross entropy over the flattened sequence; positions labelled -100 are skipped.
            loss_fct = CrossEntropyLoss(ignore_index=-100)
            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))

        if not return_dict:
            # Tuple output: (loss?), logits, remaining decoder outputs, then all encoder outputs.
            output = (lm_logits,) + decoder_outputs[1:] + encoder_outputs
            return ((loss,) + output) if loss is not None else output

        return MantaSeq2SeqLMOutput(
            loss=loss,
            logits=lm_logits,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
            frontier_predictions=encoder_outputs.frontier_predictions,
        )

    def prepare_inputs_for_generation(
        self,
        input_ids,
        past=None,
        attention_mask=None,
        head_mask=None,
        decoder_head_mask=None,
        cross_attn_head_mask=None,
        use_cache=None,
        encoder_outputs=None,
        **kwargs
    ):
        """Assemble the keyword arguments for one decoding step of `forward`.

        Args:
            input_ids: Decoder token ids generated so far; passed on as `decoder_input_ids`.
            past: Cached decoder key/value states, or `None` when no cache is available.
            attention_mask, head_mask, decoder_head_mask, cross_attn_head_mask, use_cache, encoder_outputs:
                Passed through unchanged.
            **kwargs: Ignored, except for an optional `past_key_values` entry used when `past` is `None`.

        Returns:
            A dict of keyword arguments for `forward`.
        """
        if past is None:
            # Some transformers versions hand the cache over as `past_key_values`; without this fallback it would
            # be swallowed by **kwargs and the cache silently ignored.
            past = kwargs.get("past_key_values")

        # With a cache, only the most recent token has to be fed to the decoder.
        if past is not None:
            input_ids = input_ids[:, -1:]

        return {
            "decoder_input_ids": input_ids,
            "past_key_values": past,
            "encoder_outputs": encoder_outputs,
            "attention_mask": attention_mask,
            "head_mask": head_mask,
            "decoder_head_mask": decoder_head_mask,
            "cross_attn_head_mask": cross_attn_head_mask,
            "use_cache": use_cache,
        }

    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
        """Return the decoder input ids derived from `labels` via `self._shift_right`."""
        return self._shift_right(labels)

    def _reorder_cache(self, past, beam_idx):
        """Reorder every cached state along dimension 0 according to `beam_idx`.

        Args:
            past: Iterable of per-layer sequences of cached tensors, or `None`.
            beam_idx: Index tensor selecting rows along dimension 0 of each cached tensor.

        Returns:
            A tuple of per-layer tuples of reordered tensors, or `past` unchanged when it is `None`.

        Raises:
            ValueError: If reordering changes the shape of a layer's first cached tensor.
        """
        # Without a cache there is nothing to reorder.
        if past is None:
            logger.warning("You might want to consider setting `use_cache=True` to speed up decoding")
            return past

        reordered_decoder_past = ()
        for layer_past_states in past:
            # Select along dim 0 of every state of this layer, moving the indices to each state's device.
            reordered_layer_past_states = tuple(
                layer_past_state.index_select(0, beam_idx.to(layer_past_state.device))
                for layer_past_state in layer_past_states
            )

            # Explicit check instead of `assert`, which is stripped when Python runs with -O.
            if reordered_layer_past_states[0].shape != layer_past_states[0].shape:
                raise ValueError(
                    f"reordered_layer_past_states[0] shape {reordered_layer_past_states[0].shape} and "
                    f"layer_past_states[0] shape {layer_past_states[0].shape} mismatched"
                )

            reordered_decoder_past = reordered_decoder_past + (reordered_layer_past_states,)
        return reordered_decoder_past