Source code for transformers.models.prophetnet.modeling_prophetnet

# coding=utf-8
# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch ProphetNet model, ported from ProphetNet repo(fairsequery_states version). """

import copy
import math
import warnings
from dataclasses import dataclass
from typing import Dict, Optional, Tuple

import torch
import torch.nn.functional as F
from torch import Tensor, nn

from ...activations import ACT2FN
from ...file_utils import (
    ModelOutput,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
)
from ...modeling_outputs import BaseModelOutput
from ...modeling_utils import PreTrainedModel
from ...utils import logging
from .configuration_prophetnet import ProphetNetConfig


# Module-level logger, obtained from the library's logging helper and keyed by this module's name.
logger = logging.get_logger(__name__)

_CONFIG_FOR_DOC = "ProphenetConfig"
_TOKENIZER_FOR_DOC = "ProphetNetTokenizer"

PROPHETNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
    "microsoft/prophetnet-large-uncased",
    # See all ProphetNet models at https://huggingface.co/models?filter=prophetnet
]


# Shared class-level docstring preamble for the ProphetNet models (presumably attached through the
# `add_start_docstrings` helper imported above; confirm at the call sites).
PROPHETNET_START_DOCSTRING = r"""
    This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
    methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
    pruning heads etc.)

    Original ProphetNet code can be found at <https://github.com/microsoft/ProphetNet> . Checkpoints were converted
    from original Fairseq checkpoints. For more information on the checkpoint conversion, please take a look at the
    file ``convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py``.

    This model is a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`_ sub-class. Use
    it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
    behavior.

    Parameters:
        config (:class:`~transformers.ProphetNetConfig`): Model configuration class with all the parameters of the model.
            Initializing with a config file does not load the weights associated with the model, only the
            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
            weights.
"""

# Docstring fragment describing the forward arguments of the encoder-decoder ProphetNet models (presumably
# attached through `add_start_docstrings_to_model_forward`, imported above; confirm at the call sites).
PROPHETNET_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using :class:`~transformers.ProphetNetTokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            `What are attention masks? <../glossary.html#attention-mask>`__
        decoder_input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, target_sequence_length)`, `optional`):
            Provide for translation and summarization training. By default, the model will create this tensor by
            shifting the :obj:`input_ids` to the right, following the paper.
        decoder_attention_mask (:obj:`torch.BoolTensor` of shape :obj:`(batch_size, tgt_seq_len)`, `optional`):
            Default behavior: generate a tensor that ignores pad tokens in :obj:`decoder_input_ids`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read :func:`modeling_bart._prepare_decoder_inputs` and
            modify to your needs. See diagram 1 in `the paper <https://arxiv.org/abs/1910.13461>`__ for more
            information on the default strategy.
        encoder_outputs (:obj:`tuple(tuple(torch.FloatTensor))`, `optional`):
            Tuple consists of (:obj:`last_hidden_state`, `optional`: :obj:`hidden_states`, `optional`:
            :obj:`attentions`). :obj:`last_hidden_state` of shape :obj:`(batch_size, sequence_length, hidden_size)`
            is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
            cross-attention of the decoder.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.

            If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        output_attentions (:obj:`bool`, `optional`):
            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
            tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`):
            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
            more detail.
        return_dict (:obj:`bool`, `optional`):
            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""

# Docstring fragment describing the forward arguments that need no decoder inputs: `input_ids`,
# `attention_mask` and the output-control flags (presumably used by the standalone encoder/decoder
# classes; confirm at the call sites).
PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
    Args:
        input_ids (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using :class:`~transformers.ProphetNetTokenizer`. See
            :meth:`transformers.PreTrainedTokenizer.encode` and :meth:`transformers.PreTrainedTokenizer.__call__` for
            details.

            `What are input IDs? <../glossary.html#input-ids>`__
        attention_mask (:obj:`torch.Tensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            `What are attention masks? <../glossary.html#attention-mask>`__
        output_attentions (:obj:`bool`, `optional`):
            Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
            tensors for more detail.
        output_hidden_states (:obj:`bool`, `optional`):
            Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
            more detail.
        return_dict (:obj:`bool`, `optional`):
            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
"""


def softmax(hidden_state, dim, onnx_trace=False):
    """
    Apply a softmax over ``dim`` and produce a float32 result.

    Args:
        hidden_state: tensor to normalize.
        dim: dimension along which the softmax is taken.
        onnx_trace: when true, the input is cast with ``.float()`` before the softmax instead of passing
            ``dtype=torch.float32`` to the softmax call.

    Returns:
        The softmax of ``hidden_state`` along ``dim``.
    """
    if not onnx_trace:
        # Default path: let the softmax kernel perform the float32 conversion itself.
        return F.softmax(hidden_state, dim=dim, dtype=torch.float32)

    # Tracing path: explicit up-cast first, then a plain softmax.
    upcast = hidden_state.float()
    return F.softmax(upcast, dim=dim)


def ngram_attention_bias(sequence_length, ngram, device, dtype):
    """
    This function computes the bias for the predict stream

    The result has shape ``(ngram, sequence_length, 2 * sequence_length)`` and holds ``0`` at visible positions and
    ``-inf`` everywhere else. For stream ``s`` and row ``i``:

    - in the left half (columns ``0 .. sequence_length - 1``) columns ``0 .. max(i - s, 0)`` are visible;
    - in the right half only column ``sequence_length + i`` is visible.

    Args:
        sequence_length: number of rows per stream; the last dimension is twice this size.
        ngram: number of streams (size of the first dimension).
        device: device on which the tensor is created.
        dtype: dtype of the returned tensor.

    Returns:
        The bias tensor described above.
    """
    positions = torch.arange(sequence_length, device=device)
    streams = torch.arange(ngram, device=device)

    # Left half: last visible column for every (stream, row) pair, i.e. max(i - s, 0).
    last_visible = (positions.unsqueeze(0) - streams.unsqueeze(1)).clamp(min=0)
    left_visible = positions[None, None, :] <= last_visible.unsqueeze(-1)

    # Right half: every stream sees only the diagonal entry of its row.
    diagonal = positions.unsqueeze(0) == positions.unsqueeze(1)
    right_visible = diagonal.unsqueeze(0).expand(ngram, -1, -1)

    visible = torch.cat((left_visible, right_visible), dim=-1)

    # Start fully masked, then open the visible positions in one vectorized write instead of
    # ngram * sequence_length separate indexed assignments.
    bias = torch.full(visible.shape, float("-inf"), device=device, dtype=dtype)
    return bias.masked_fill_(visible, 0)


def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_bidirectional=False):
    """
    Map relative positions to bucket indices. For more detail, see paper.

    The negated relative positions are used as distances. Distances below ``max_exact`` (half of the available
    buckets) keep their own bucket; larger ones are placed on a logarithmic scale and capped at the last bucket.
    In the bidirectional case the bucket count is halved, the absolute distance is used, and entries whose negated
    position is negative are shifted by the halved bucket count. Otherwise negative distances are clamped to zero.

    Args:
        num_buckets: total number of buckets.
        max_distance: distance used as the upper reference of the logarithmic scale.
        relative_positions: tensor of relative positions.
        is_bidirectional: whether both directions get their own half of the buckets.

    Returns:
        Tensor of bucket indices with the same shape as ``relative_positions``.
    """
    distances = -relative_positions

    if is_bidirectional:
        num_buckets = num_buckets // 2
        # One half of the buckets is reserved for entries with a negative (negated) position.
        direction_offset = (distances < 0).int() * num_buckets
        distances = distances.abs()
    else:
        direction_offset = 0
        distances = distances.clamp(min=0)

    max_exact = num_buckets // 2

    # Logarithmic placement for distances that do not get an exact bucket.
    log_ratio = torch.log(distances.float() / max_exact) / math.log(max_distance / max_exact)
    coarse_bucket = max_exact + log_ratio * (num_buckets - max_exact)
    coarse_bucket = torch.min(coarse_bucket, torch.full_like(coarse_bucket, num_buckets - 1)).int()

    exact_bucket = distances.int()
    uses_exact_bucket = distances < max_exact
    return direction_offset + torch.where(uses_exact_bucket, exact_bucket, coarse_bucket)


def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids):
    """
    Compute the relative position buckets of the main stream and of the predict stream. For more detail, see paper.

    Args:
        num_buckets: number of buckets, forwarded to :func:`compute_relative_buckets`.
        max_distance: maximum distance, forwarded to :func:`compute_relative_buckets`.
        position_ids: tensor of position ids; its last dimension is the sequence length.

    Returns:
        A ``(main_buckets, predict_buckets)`` tuple. The predict stream is measured against the concatenation of
        ``position_ids - 1`` and ``position_ids``, so its last dimension is twice as long as the main stream's.
    """
    sequence_length = position_ids.size(-1)
    query_positions = position_ids.unsqueeze(-1)

    def _offsets_from(key_positions):
        # Tile the key positions once per query row, then subtract each row's own position.
        tiled = key_positions.unsqueeze(1).repeat(1, sequence_length, 1)
        return tiled - query_positions

    # main stream
    main_offsets = _offsets_from(position_ids)

    # predicting stream
    predict_offsets = _offsets_from(torch.cat((position_ids - 1, position_ids), dim=-1))

    # both streams use the unidirectional bucketing
    main_buckets = compute_relative_buckets(num_buckets, max_distance, main_offsets, is_bidirectional=False)
    predict_buckets = compute_relative_buckets(num_buckets, max_distance, predict_offsets, is_bidirectional=False)
    return main_buckets, predict_buckets


@dataclass
class ProphetNetSeq2SeqLMOutput(ModelOutput):
    """
    Base class for sequence-to-sequence language models outputs.

    Args:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Language modeling loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the main stream language modeling head (scores for each vocabulary token before
            SoftMax).
        logits_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
            SoftMax).
        past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
            List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
            batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see :obj:`past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_ngram_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            encoder_sequence_length, encoder_sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    logits_ngram: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_ngram_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    decoder_ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None

    @property
    def decoder_cross_attentions(self):
        """Deprecated alias of :obj:`cross_attentions`; emits a :obj:`FutureWarning` on every access."""
        warnings.warn(
            "`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions` instead.",
            FutureWarning,
        )
        return self.cross_attentions
@dataclass
class ProphetNetSeq2SeqModelOutput(ModelOutput):
    """
    Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential
    decoding.

    Args:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`):
            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

            If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
            1, hidden_size)` is output.
        last_hidden_state_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
        past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
            List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
            batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see :obj:`past_key_values` input) to speed up sequential decoding.
        decoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        decoder_ngram_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        decoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        decoder_ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the cross-attention heads.
        encoder_last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder of the model.
        encoder_hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, encoder_sequence_length, hidden_size)`.

            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
        encoder_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            encoder_sequence_length, encoder_sequence_length)`.

            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
    """

    last_hidden_state: torch.FloatTensor
    last_hidden_state_ngram: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_ngram_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
    decoder_ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None

    @property
    def decoder_cross_attentions(self):
        """Deprecated alias of :obj:`cross_attentions`; emits a :obj:`FutureWarning` on every access."""
        warnings.warn(
            "`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions` instead.",
            FutureWarning,
        )
        return self.cross_attentions
@dataclass
class ProphetNetDecoderModelOutput(ModelOutput):
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`):
            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.

            If :obj:`past_key_values` is used only the last hidden-state of the sequences of shape :obj:`(batch_size,
            1, hidden_size)` is output.
        last_hidden_state_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
        past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
            List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
            batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see :obj:`past_key_values` input) to speed up sequential decoding.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        hidden_states_ngram (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the cross-attention heads.
    """

    last_hidden_state: torch.FloatTensor
    last_hidden_state_ngram: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
@dataclass
class ProphetNetDecoderLMOutput(ModelOutput):
    """
    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        loss (:obj:`torch.FloatTensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` is provided):
            Language modeling loss.
        logits (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the main stream language modeling head (scores for each vocabulary token before
            SoftMax).
        logits_ngram (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
            Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
            SoftMax).
        past_key_values (:obj:`List[torch.FloatTensor]`, `optional`, returned when ``use_cache=True`` is passed or when ``config.use_cache=True``):
            List of :obj:`torch.FloatTensor` of length :obj:`config.n_layers`, with each tensor of shape :obj:`(2,
            batch_size, num_attn_heads, decoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
            used (see :obj:`past_key_values` input) to speed up sequential decoding.
        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, decoder_sequence_length, hidden_size)`.

            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
        hidden_states_ngram (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
            of shape :obj:`(batch_size, ngram * decoder_sequence_length, hidden_size)`.

            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
            outputs.
        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
            self-attention heads.
        ngram_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            decoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
            weighted average in the self-attention heads.
        cross_attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape :obj:`(batch_size, num_attn_heads,
            encoder_sequence_length, decoder_sequence_length)`.

            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
            compute the weighted average in the cross-attention heads.
    """

    loss: Optional[torch.FloatTensor] = None
    logits: torch.FloatTensor = None
    logits_ngram: Optional[torch.FloatTensor] = None
    past_key_values: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
    hidden_states_ngram: Optional[Tuple[torch.FloatTensor]] = None
    attentions: Optional[Tuple[torch.FloatTensor]] = None
    ngram_attentions: Optional[Tuple[torch.FloatTensor]] = None
    cross_attentions: Optional[Tuple[torch.FloatTensor]] = None
def ProphetNetLayerNorm(normalized_shape, eps=1e-5, elementwise_affine=True):
    """
    Build the layer-normalization module used throughout the model.

    When CUDA is available an apex fused implementation is tried first; if the import fails (or CUDA is not
    available) a regular :obj:`torch.nn.LayerNorm` is returned.
    """
    if torch.cuda.is_available():
        try:
            # NOTE(review): this name looks like a rename artifact of apex's `FusedLayerNorm`. If apex does not
            # export it, the import always fails and the plain `torch.nn.LayerNorm` below is used - confirm.
            from apex.normalization import FusedProphetNetLayerNorm

            return FusedProphetNetLayerNorm(normalized_shape, eps, elementwise_affine)
        except ImportError:
            pass
    return torch.nn.LayerNorm(normalized_shape, eps, elementwise_affine)


class ProphetNetPreTrainedModel(PreTrainedModel):
    """Base class of the ProphetNet models: weight initialization and label shifting helpers."""

    config_class = ProphetNetConfig
    base_model_prefix = "prophetnet"

    def _init_weights(self, module):
        """Initialize linear and embedding weights from N(0, ``config.init_std``); zero biases and padding rows."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()

    def _shift_right(self, input_ids):
        """
        Shift :obj:`input_ids` one position to the right along the last dimension.

        The first position is filled with ``config.decoder_start_token_id`` and every ``-100`` entry is replaced by
        ``config.pad_token_id``. Both config values have to be set.
        """
        decoder_start_token_id = self.config.decoder_start_token_id
        pad_token_id = self.config.pad_token_id

        assert (
            decoder_start_token_id is not None
        ), "self.model.config.decoder_start_token_id has to be defined. In ProphetNet it is usually set to the pad_token_id. See ProphetNet docs for more information"

        # shift inputs to the right
        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
        shifted_input_ids[..., 0] = decoder_start_token_id

        assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
        # replace possible -100 values in labels by `pad_token_id`
        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

        assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values"

        return shifted_input_ids


class ProhpetNetPositionalEmbeddings(nn.Embedding):
    """
    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
    the forward function.
    """

    def __init__(self, config: ProphetNetConfig):
        super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)

    def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
        """
        Return ``(position_embeddings, position_ids)``.

        If :obj:`position_ids` is not given it is derived either from the length of the cached keys (single-step
        decoding) or from the cumulative sum of :obj:`attention_mask` (all ones when no mask is passed), offset by
        ``padding_idx``.
        """
        assert (position_ids is None) or (
            self.padding_idx is None
        ), "If position_ids is pre-computed then padding_idx should not be set."

        if position_ids is None:
            if past_key_values is not None:
                # position_ids is the same for every token when decoding a single step
                # Without the int() cast, it doesn't work in some cases when exporting to ONNX
                prev_num_input_ids = past_key_values[0]["self"]["prev_key_states"].shape[2]
                num_input_ids = inputs_shape[1] + prev_num_input_ids
                position_ids = torch.ones((1, 1), dtype=torch.long, device=device) * (
                    int(self.padding_idx + num_input_ids)
                )
            else:
                if attention_mask is None:
                    attention_mask = torch.ones(inputs_shape, dtype=torch.long, device=device)

                # retrieve position_ids from input_ids / attention_mask
                position_ids = (
                    torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
                ).long() + self.padding_idx

        return super().forward(position_ids), position_ids

    def _forward(self, position_ids):
        """Plain embedding lookup for already computed :obj:`position_ids`."""
        return super().forward(position_ids)


class ProphetNetSelfAttention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(
        self,
        config: ProphetNetConfig,
        num_attn_heads: int,
    ):
        super().__init__()
        hidden_size = config.hidden_size

        self.attention_dropout = config.attention_dropout
        self.dropout = config.dropout
        self.num_attn_heads = num_attn_heads
        self.head_dim = hidden_size // num_attn_heads

        assert (
            self.head_dim * num_attn_heads == hidden_size
        ), "`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and `config.num_decoder_attention_heads`"

        self.key_proj = nn.Linear(hidden_size, hidden_size)
        self.value_proj = nn.Linear(hidden_size, hidden_size)
        self.query_proj = nn.Linear(hidden_size, hidden_size)

        self.out_proj = nn.Linear(hidden_size, hidden_size)

    def _reshape(self, tensor, first_dim, batch_size):
        # (first_dim, batch, hidden) -> (batch * heads, first_dim, head_dim); flattened index = batch * heads + head
        return tensor.reshape(first_dim, batch_size * self.num_attn_heads, self.head_dim).transpose(0, 1)

    def forward(
        self,
        hidden_states,
        key_value_states: Optional[Tensor] = None,
        attention_mask: Optional[Tensor] = None,
        layer_state: Optional[Dict[str, Optional[Tensor]]] = None,
    ) -> Tuple[Tensor, Optional[Tensor]]:
        """
        Attend from :obj:`hidden_states` (sequence first) to itself or, when :obj:`key_value_states` is given, to
        :obj:`key_value_states` (cross-attention).

        For cross-attention the projected keys/values are read from and written to
        ``layer_state["cross_attention"]`` when a :obj:`layer_state` dict is passed; without one they are simply
        recomputed. :obj:`attention_mask`, if given, is added to the attention scores and must have shape
        ``(num_attn_heads * batch_size, 1, key_sequence_length)``.

        Returns the attention output of shape ``(sequence_length, batch_size, hidden_size)`` and the pre-softmax
        attention scores viewed as ``(batch_size, num_attn_heads, sequence_length, key_sequence_length)``.
        """
        sequence_length, batch_size, hidden_size = hidden_states.size()

        # if key_value_states are provided this layer is used as a cross-attention layer
        # for the decoder
        is_cross_attention = key_value_states is not None
        cache_key = "cross_attention" if is_cross_attention else "self"
        assert list(hidden_states.size()) == [
            sequence_length,
            batch_size,
            hidden_size,
        ], f"Size of hidden states should be {sequence_length, batch_size, hidden_size}, but is {hidden_states.size()}"

        # previous time steps are cached - no need to recompute key and value if they are static
        # (always bind `saved_state` so that cross-attention without a cache does not hit an unbound local)
        saved_state = layer_state.get(cache_key, None) if layer_state is not None else None

        query_states = self.query_proj(hidden_states) / (self.head_dim ** 0.5)
        query_states = self._reshape(query_states, sequence_length, batch_size)

        if not is_cross_attention:
            # self-attention
            key_states = self.key_proj(hidden_states)
            key_states = self._reshape(key_states, -1, batch_size)
            value_states = self.value_proj(hidden_states)
            value_states = self._reshape(value_states, -1, batch_size)
        elif saved_state is None:
            # cross-attention without layer state
            key_states = self.key_proj(key_value_states)
            key_states = self._reshape(key_states, -1, batch_size)
            value_states = self.value_proj(key_value_states)
            value_states = self._reshape(value_states, -1, batch_size)
        else:
            key_states = saved_state["prev_key_states"].view(batch_size * self.num_attn_heads, -1, self.head_dim)
            value_states = saved_state["prev_value_states"].view(batch_size * self.num_attn_heads, -1, self.head_dim)

        # Update cache (only possible when the caller handed in a cache dict)
        if is_cross_attention and layer_state is not None:
            layer_state[cache_key] = {
                "prev_key_states": key_states.view(batch_size, self.num_attn_heads, -1, self.head_dim),
                "prev_value_states": value_states.view(batch_size, self.num_attn_heads, -1, self.head_dim),
            }

        key_sequence_length = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
        assert attn_weights.size() == (
            batch_size * self.num_attn_heads,
            sequence_length,
            key_sequence_length,
        ), f"`attn_weights` should be of size {batch_size * self.num_attn_heads, sequence_length, key_sequence_length}, but is of size {attn_weights.shape}"

        # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
        if attention_mask is not None and attention_mask.dim() == 0:
            attention_mask = None
        assert attention_mask is None or attention_mask.size() == (
            self.num_attn_heads * batch_size,
            1,
            key_sequence_length,
        ), f"`attention_mask` should be `None` or of shape attention_mask.size() == {batch_size * self.num_attn_heads, 1, key_sequence_length}, but is {attention_mask.shape}"

        if attention_mask is not None:  # don't attend to padding symbols
            attn_weights = attn_weights + attention_mask

        # need two reshapes to keep gradient at attention weights
        attn_weights_reshaped = attn_weights.view(
            batch_size, self.num_attn_heads, sequence_length, key_sequence_length
        )
        attn_weights = attn_weights_reshaped.view(
            batch_size * self.num_attn_heads, sequence_length, key_sequence_length
        )

        attn_weights = F.softmax(attn_weights, dim=-1)
        attn_probs = F.dropout(
            attn_weights,
            p=self.attention_dropout,
            training=self.training,
        )

        attn_output = torch.bmm(attn_probs, value_states)
        assert attn_output.size() == (
            batch_size * self.num_attn_heads,
            sequence_length,
            self.head_dim,
        ), f"`attn_output` should be of shape {batch_size * self.num_attn_heads, sequence_length, self.head_dim}, but is of shape {attn_output.size()}"
        attn_output = attn_output.transpose(0, 1).contiguous().view(sequence_length, batch_size, hidden_size)

        attn_output = self.out_proj(attn_output)

        attn_output = F.dropout(attn_output, p=self.dropout, training=self.training)
        return attn_output, attn_weights_reshaped


class ProhpetNetFeedForward(nn.Module):
    """
    This is the residual two feed-forward layer block based on the original Transformer implementation.
    """

    def __init__(self, config: ProphetNetConfig, ffn_dim: int):
        super().__init__()
        self.activation_fn = ACT2FN[config.activation_function]
        self.intermediate = nn.Linear(config.hidden_size, ffn_dim)
        self.output = nn.Linear(ffn_dim, config.hidden_size)
        self.activation_dropout = config.activation_dropout
        self.dropout = config.dropout

    def forward(self, hidden_states):
        """Apply linear -> activation -> dropout -> linear -> dropout. The residual is added by the caller."""
        hidden_states = self.intermediate(hidden_states)
        hidden_states = self.activation_fn(hidden_states)

        hidden_states = F.dropout(hidden_states, p=self.activation_dropout, training=self.training)
        hidden_states = self.output(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)
        return hidden_states


class ProphetNetNgramProphetNetSelfAttention(nn.Module):
    """
    Decoder self-attention that processes the main stream and the ``config.ngram`` predict streams in one call and
    adds learned relative position terms to the attention scores of both.
    """

    def __init__(self, config: ProphetNetConfig):
        super().__init__()
        self.hidden_size = config.hidden_size

        self.num_buckets = config.num_buckets
        self.relative_max_distance = config.relative_max_distance
        self.num_attn_heads = config.num_attention_heads
        self.dropout = config.dropout
        self.attention_dropout = config.attention_dropout
        self.head_dim = config.hidden_size // self.num_attn_heads
        self.ngram = config.ngram

        assert (
            self.head_dim * self.num_attn_heads == config.hidden_size
        ), "config.hidden_size must be divisible by num_attn_heads"
        # key, value, query projection
        self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
        self.query_proj = nn.Linear(config.hidden_size, config.hidden_size)

        # out projection
        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)

        # rel position embeddings
        self.relative_pos_embeddings = nn.Linear(config.hidden_size, self.num_buckets * self.num_attn_heads)

        # for onnx runtime
        self.onnx_trace = False

    def _reshape(self, tensor, first_dim, batch_size):
        # (first_dim, batch, hidden) -> (batch * heads, first_dim, head_dim); flattened index = batch * heads + head
        return tensor.reshape(first_dim, batch_size * self.num_attn_heads, self.head_dim).transpose(0, 1)

    def prepare_for_onnx_export_(self):
        self.onnx_trace = True

    def forward(
        self,
        hidden_states,
        layer_state=None,
        attention_mask=None,
        extended_predict_attention_mask=None,
        main_relative_position_buckets=None,
        predict_relative_position_buckets=None,
        position_ids=None,
    ):
        """
        Run main-stream and predict-stream self-attention.

        :obj:`hidden_states` is sequence first and holds ``1 + ngram`` equally sized chunks along dim 0 (main stream
        followed by the predict streams). When a :obj:`layer_state` dict is passed, cached main-stream keys/values
        under ``layer_state["self"]`` are prepended and the entry is overwritten with the updated ones; when it is
        :obj:`None` no cache is read or written.

        Returns ``(attn_output, main_attn_probs, predict_attn_probs)``.
        """
        sequence_length, batch_size, hidden_size = hidden_states.size()

        assert list(hidden_states.size()) == [
            sequence_length,
            batch_size,
            hidden_size,
        ], f"`hidden_states` should be of shape {sequence_length, batch_size, hidden_size}, but is of shape {hidden_states.shape}"

        # key and value of previous time steps are cached
        # (`layer_state` defaults to None, so guard the lookup instead of failing on `None.get`)
        saved_state = layer_state.get("self", None) if layer_state is not None else None

        # project
        query_states = self.query_proj(hidden_states)
        key_states = self.key_proj(hidden_states)
        value_states = self.value_proj(hidden_states)

        # normalize
        query_states = query_states / (self.head_dim ** 0.5)

        # reshape
        query_states = self._reshape(query_states, sequence_length, batch_size)
        key_states = self._reshape(key_states, -1, batch_size)
        value_states = self._reshape(value_states, -1, batch_size)

        # chunk into main stream and predict stream
        hidden_states_list = hidden_states.chunk(1 + self.ngram, dim=0)

        query_states_list = query_states.chunk(1 + self.ngram, dim=1)
        key_states_list = key_states.chunk(1 + self.ngram, dim=1)
        value_states_list = value_states.chunk(1 + self.ngram, dim=1)

        main_hidden_states, hidden_states_predict_list = hidden_states_list[0], hidden_states_list[1:]
        main_query_states, predict_query_states_list = query_states_list[0], query_states_list[1:]
        main_key_states, predict_key_states_list = key_states_list[0], key_states_list[1:]
        main_value_states, predict_value_states_list = value_states_list[0], value_states_list[1:]

        # saved states are stored with shape (batch_size, num_attn_heads, seq_len, head_dim)
        if saved_state is not None:
            prev_main_key_states = saved_state["prev_key_states"].view(
                batch_size * self.num_attn_heads, -1, self.head_dim
            )
            main_key_states = torch.cat((prev_main_key_states, main_key_states), dim=1)
            prev_main_value_states = saved_state["prev_value_states"].view(
                batch_size * self.num_attn_heads, -1, self.head_dim
            )
            main_value_states = torch.cat((prev_main_value_states, main_value_states), dim=1)

        # Update cache
        if layer_state is not None:
            layer_state["self"] = {
                "prev_key_states": main_key_states.view(batch_size, self.num_attn_heads, -1, self.head_dim),
                "prev_value_states": main_value_states.view(batch_size, self.num_attn_heads, -1, self.head_dim),
            }

        # get seq_length of main stream only
        main_sequence_length = sequence_length // (1 + self.ngram)

        # MAIN-STREAM
        # main attn weights
        main_attn_weights = torch.bmm(main_query_states, main_key_states.transpose(1, 2))

        # retrieve relative position embeddings for each layer -> see paper for more details
        main_relative_pos_embeddings = self.get_main_relative_pos_embeddings(
            main_hidden_states, main_attn_weights, position_ids, main_relative_position_buckets
        )
        main_attn_weights = main_attn_weights + main_relative_pos_embeddings

        if attention_mask is not None:
            main_attn_weights = main_attn_weights + attention_mask

        main_attn_probs = softmax(
            main_attn_weights,
            dim=-1,
            onnx_trace=self.onnx_trace,
        ).type_as(main_attn_weights)

        main_attn_probs = F.dropout(main_attn_probs, p=self.attention_dropout, training=self.training)

        # project to attn_output
        main_attn_output = torch.bmm(main_attn_probs, main_value_states)
        main_attn_output = (
            main_attn_output.transpose(0, 1).contiguous().view(1, main_sequence_length, batch_size, hidden_size)
        )
        main_attn_output = self.out_proj(main_attn_output)

        # PREDICT-STREAM
        # [ngram, B*head, T, c]
        predict_query_states = torch.cat(predict_query_states_list, 0).view(
            self.ngram, -1, main_sequence_length, self.head_dim
        )
        # [ngram, B*head, 2*T, c]
        predict_key_states = torch.cat(
            [torch.cat([main_key_states, key], 1).unsqueeze(0) for key in predict_key_states_list], 0
        )

        # [ngram, T, B, C]
        predict_hidden_states = torch.cat(hidden_states_predict_list, 0).view(
            self.ngram, main_sequence_length, batch_size, hidden_size
        )

        # [ngram, B*head, 2*T, c]
        predict_value_states = torch.cat(
            [torch.cat([main_value_states, v_p], 1).unsqueeze(0) for v_p in predict_value_states_list], 0
        )
        # [ngram, B*head, T, 2*T]
        predict_attn_weights = torch.einsum("nbtc,nbsc->nbts", (predict_query_states, predict_key_states))

        # [ngram, B*head, T, S]
        # retrieve relative position embeddings for each layer -> see paper for more details
        predict_relative_pos_embeddings = self.get_predict_relative_pos_embeddings(
            predict_hidden_states, predict_attn_weights, position_ids, predict_relative_position_buckets
        )

        # [ngram, B*head, T, 2*T]
        predict_attn_weights = predict_attn_weights + predict_relative_pos_embeddings

        if extended_predict_attention_mask is not None:
            predict_attn_weights = predict_attn_weights + extended_predict_attention_mask

        predict_attn_probs = softmax(
            predict_attn_weights,
            dim=-1,
            onnx_trace=self.onnx_trace,
        ).type_as(predict_attn_weights)
        predict_attn_probs = F.dropout(predict_attn_probs, p=self.attention_dropout, training=self.training)

        # project to attention output
        # [ngram, B*head, T, c]
        predict_attn_output = torch.einsum("nbts,nbsc->nbtc", (predict_attn_probs, predict_value_states))

        # [ngram, T, B, C]
        predict_attn_output = (
            predict_attn_output.transpose(1, 2)
            .contiguous()
            .view(self.ngram, main_sequence_length, batch_size, hidden_size)
        )
        predict_attn_output = self.out_proj(predict_attn_output)

        # concat to single attn output
        # [1+ngram*T, B, C]
        attn_output = torch.cat([main_attn_output, predict_attn_output], 0).view(-1, batch_size, hidden_size)

        # reshape into better form for `config.output_attentions`
        main_attn_probs = main_attn_probs.view(batch_size, self.num_attn_heads, main_sequence_length, -1)
        predict_attn_probs = predict_attn_probs.view(
            self.ngram, batch_size, self.num_attn_heads, main_sequence_length, -1
        ).transpose(0, 1)

        attn_output = F.dropout(attn_output, p=self.dropout, training=self.training)
        return attn_output, main_attn_probs, predict_attn_probs

    def get_main_relative_pos_embeddings(
        self, hidden_states, attn_weights, position_ids, main_relative_position_buckets
    ):
        """
        Gather, for every main-stream query/key pair, the relative position term that is added to the attention
        scores. Buckets are computed from :obj:`position_ids` when :obj:`main_relative_position_buckets` is
        :obj:`None`. The result has the shape of :obj:`attn_weights`.
        """
        # input hidden_states [T,B,C], input attn_weights [T*head,T,S], input position_ids [B,T] or [1,1]

        if main_relative_position_buckets is None:
            # NOTE(review): with the sequence-first input documented above, `shape[:2]` is (T, B), so these two names
            # look swapped relative to their contents - confirm before relying on them.
            batch_size, sequence_length = hidden_states.shape[:2]
            relative_positions = (
                torch.arange(1, attn_weights.shape[-1] + 1)
                .unsqueeze(0)
                .unsqueeze(0)
                .repeat(batch_size, sequence_length, 1)
                .to(position_ids.device)
            )
            relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(
                batch_size, sequence_length, 1
            )  # [B, T, s]
            main_relative_position_buckets = compute_relative_buckets(
                self.num_buckets, self.relative_max_distance, relative_positions, False
            )

        hidden_states = hidden_states.transpose(0, 1)  # [B,T,C]
        rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)  # [B,T,Buckets*head]
        rel_pos_embeddings = rel_pos_embeddings.view(
            rel_pos_embeddings.shape[:2] + (self.num_buckets, self.num_attn_heads)
        ).permute(
            0, 3, 1, 2
        )  # [B,T,Buckets,head]
        rel_pos_embeddings = rel_pos_embeddings.reshape(attn_weights.shape[:2] + (-1,))  # [B*head,T,Buckets]

        main_relative_position_buckets = (
            main_relative_position_buckets.repeat(1, self.num_attn_heads, 1)
            .view(-1, main_relative_position_buckets.shape[-1])
            .long()
        )  # [B*head*T, T]
        rel_pos_embeddings = rel_pos_embeddings.reshape(-1, rel_pos_embeddings.size(-1))  # [B*head*T,Buckets]

        main_relative_pos_embeddings = torch.gather(
            rel_pos_embeddings, dim=1, index=main_relative_position_buckets
        ).view(attn_weights.shape[:2] + (-1,))

        return main_relative_pos_embeddings

    def get_predict_relative_pos_embeddings(
        self, hidden_states, attn_weights, position_ids, predict_relative_position_buckets
    ):
        """
        Gather, for every predict-stream query/key pair, the relative position term that is added to the attention
        scores. Buckets are computed from :obj:`position_ids` when :obj:`predict_relative_position_buckets` is
        :obj:`None`. The result has shape ``(ngram, batch_size * num_attn_heads, sequence_length, -1)``.
        """
        # input hidden_states [ngram, T,B,C], input attn_weights [ngram, B*head,T,S], input position_ids [B,T] or [1,1], input predict_relative_position_buckets [B,T, 2*T] or None

        sequence_length, batch_size = hidden_states.shape[1:3]

        if predict_relative_position_buckets is None:
            key_sequence_length = attn_weights.shape[-1]
            assert (
                position_ids[0][0] == key_sequence_length - 1
            ), "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
            relative_positions = (
                torch.arange(0, key_sequence_length)
                .unsqueeze(0)
                .unsqueeze(0)
                .repeat(batch_size, sequence_length, 1)
                .to(position_ids.device)
            )

            relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
            predict_relative_position_buckets = compute_relative_buckets(
                self.num_buckets, self.relative_max_distance, relative_positions, False
            )

        hidden_states = hidden_states.transpose(1, 2)  # [ngram, B, T, C]
        rel_pos_embeddings = self.relative_pos_embeddings(hidden_states).view(
            hidden_states.shape[:-1] + (self.num_buckets, self.num_attn_heads)
        )  # [ngram, B, T, bucket, head]
        rel_pos_embeddings = rel_pos_embeddings.permute(0, 1, 4, 2, 3).reshape(
            self.ngram * batch_size * self.num_attn_heads, sequence_length, -1
        )  # [ngram*B*head, T, bucket]

        predict_relative_position_buckets = predict_relative_position_buckets.unsqueeze(0).repeat(
            self.ngram, 1, self.num_attn_heads, 1
        )  # [ngram, B, head*T, S]

        rel_pos_embeddings = rel_pos_embeddings.reshape(-1, rel_pos_embeddings.size(-1))
        predict_relative_position_buckets = predict_relative_position_buckets.view(
            -1, predict_relative_position_buckets.size(-1)
        ).long()  # [ngram*B*head*T, S]

        predict_relative_pos_embeddings = torch.gather(
            rel_pos_embeddings, dim=1, index=predict_relative_position_buckets
        ).view(
            self.ngram, batch_size * self.num_attn_heads, sequence_length, -1
        )  # [ngram, B*head, T, S]

        return predict_relative_pos_embeddings


class ProphetNetEncoderLayer(nn.Module):
    """
    Encoder block for Prophetnet
    """

    def __init__(self, config: ProphetNetConfig):
        super().__init__()
        # 1st residual block
        self.self_attn = ProphetNetSelfAttention(config, config.num_encoder_attention_heads)
        self.self_attn_layer_norm = ProphetNetLayerNorm(config.hidden_size)

        # 2nd residual block
        self.feed_forward = ProhpetNetFeedForward(config, config.encoder_ffn_dim)
        self.feed_forward_layer_norm = ProphetNetLayerNorm(config.hidden_size)

    def forward(self, hidden_states, attention_mask):
        """Self-attention and feed-forward, each followed by residual add + layer norm."""
        # 1st residual block
        attention_output, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
        )
        hidden_states = self.self_attn_layer_norm(attention_output + hidden_states)

        # 2nd residual block
        feed_forward_output = self.feed_forward(hidden_states)
        hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
        return hidden_states, attn_weights


class ProphetNetDecoderLayer(nn.Module):
    """
    Decoder block for Prophetnet
    """

    def __init__(self, config: ProphetNetConfig):
        super().__init__()
        # 1st residual block
        self.self_attn = ProphetNetNgramProphetNetSelfAttention(config)
        self.self_attn_layer_norm = ProphetNetLayerNorm(config.hidden_size)

        # 2nd residual block
        if config.add_cross_attention:
            self.cross_attn = ProphetNetSelfAttention(config, config.num_decoder_attention_heads)
            self.cross_attn_layer_norm = ProphetNetLayerNorm(config.hidden_size)

        # 3rd residual block
        self.feed_forward = ProhpetNetFeedForward(config, config.decoder_ffn_dim)
        self.feed_forward_layer_norm = ProphetNetLayerNorm(config.hidden_size)

    def forward(
        self,
        hidden_states,
        encoder_hidden_states=None,
        encoder_attn_mask=None,
        layer_state=None,
        attention_mask=None,
        extended_predict_attention_mask=None,
        main_relative_position_buckets=None,
        predict_relative_position_buckets=None,
        position_ids=None,
    ):
        """
        N-gram self-attention, optional cross-attention (only when :obj:`encoder_hidden_states` is given) and
        feed-forward, each followed by residual add + layer norm.

        Returns ``(hidden_states, self_attn_weights, self_attn_weights_ngram, cross_attn_weights, layer_state)``
        where :obj:`layer_state` is the (possibly freshly created) cache dict filled by the attention modules.
        """
        layer_state = layer_state if layer_state is not None else {}

        # 1st residual block
        ngram_attention_output, self_attn_weights, self_attn_weights_ngram = self.self_attn(
            hidden_states=hidden_states,
            layer_state=layer_state,
            attention_mask=attention_mask,
            extended_predict_attention_mask=extended_predict_attention_mask,
            main_relative_position_buckets=main_relative_position_buckets,
            predict_relative_position_buckets=predict_relative_position_buckets,
            position_ids=position_ids,
        )
        hidden_states = self.self_attn_layer_norm(hidden_states + ngram_attention_output)

        cross_attn_weights = None
        if encoder_hidden_states is not None:
            # 2nd residual block
            attention_output, cross_attn_weights = self.cross_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attn_mask,
                layer_state=layer_state,  # mutates layer state
            )
            hidden_states = self.cross_attn_layer_norm(attention_output + hidden_states)

        # 3rd residual block
        feed_forward_output = self.feed_forward(hidden_states)
        hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)

        return (
            hidden_states,
            self_attn_weights,
            self_attn_weights_ngram,
            cross_attn_weights,
            layer_state,
        )  # just self_attn weights for now, following t5, layer_state = cache for decoding
@add_start_docstrings(
    "The standalone encoder part of the ProphetNetModel.",
    PROPHETNET_START_DOCSTRING,
)
class ProphetNetEncoder(ProphetNetPreTrainedModel):
    r"""
    word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`):
        The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with
        pre-defined word embeddings instead of randomly initialized word embeddings.
    """

    def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None):
        super().__init__(config)

        self.word_embeddings = (
            word_embeddings
            if word_embeddings is not None
            else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        )
        self.position_embeddings = ProhpetNetPositionalEmbeddings(config)
        self.embeddings_layer_norm = ProphetNetLayerNorm(config.hidden_size)

        self.layers = nn.ModuleList([ProphetNetEncoderLayer(config) for _ in range(config.num_encoder_layers)])

        self.init_weights()

    def get_input_embeddings(self):
        return self.word_embeddings

    def set_input_embeddings(self, value):
        self.word_embeddings = value

    @add_start_docstrings_to_model_forward(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Returns:

        Example::

            >>> from transformers import ProphetNetTokenizer, ProphetNetEncoder
            >>> import torch

            >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
            >>> model = ProphetNetEncoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone')
            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)

            >>> last_hidden_states = outputs.last_hidden_state
        """

        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if input_ids is None and inputs_embeds is None:
            raise ValueError("Either input_ids or inputs_embeds has to be passed.")
        elif input_ids is not None and inputs_embeds is not None:
            raise ValueError("Make sure to only pass input_ids or inputs_embeds.")
        elif input_ids is not None and inputs_embeds is None:
            inputs_embeds = self.word_embeddings(input_ids)

        # prepare attention mask
        if attention_mask is not None:
            # The attention layers flatten (batch, head) batch-major (index = sample * heads + head), so each
            # sample's mask row has to be repeated `heads` times consecutively. A tiled `.repeat(heads, 1, 1)`
            # would order the rows as head * batch + sample and mis-assign masks for batch sizes > 1.
            extended_attention_mask = (
                1.0 - attention_mask[:, None, :].repeat_interleave(self.config.num_attention_heads, dim=0)
            ) * -10000.0
            extended_attention_mask = extended_attention_mask.to(inputs_embeds.dtype)
        else:
            extended_attention_mask = None

        position_embeddings, position_ids = self.position_embeddings(inputs_embeds.shape[:2], inputs_embeds.device)

        hidden_states = inputs_embeds + position_embeddings
        hidden_states = self.embeddings_layer_norm(hidden_states)
        hidden_states = F.dropout(hidden_states, p=self.config.dropout, training=self.training)
        hidden_states = hidden_states.transpose(0, 1)  # B x T x C -> T x B x C

        encoder_hidden_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        for encoder_layer in self.layers:
            if output_hidden_states:
                # collected hidden states are reported batch first, the layers themselves run sequence first
                hidden_states = hidden_states.transpose(0, 1)
                encoder_hidden_states = encoder_hidden_states + (hidden_states,)
                hidden_states = hidden_states.transpose(0, 1)
            hidden_states, attn_probs = encoder_layer(hidden_states, attention_mask=extended_attention_mask)
            if output_attentions:
                all_attentions = all_attentions + (attn_probs,)

        hidden_states = hidden_states.transpose(0, 1)
        if output_hidden_states:
            encoder_hidden_states = encoder_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, encoder_hidden_states, all_attentions] if v is not None)
        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_hidden_states, attentions=all_attentions
        )
[docs]@add_start_docstrings( "The standalone decoder part of the ProphetNetModel.", PROPHETNET_START_DOCSTRING, ) class ProphetNetDecoder(ProphetNetPreTrainedModel): r""" word_embeddings (:obj:`torch.nn.Embeddings` of shape :obj:`(config.vocab_size, config.hidden_size)`, `optional`): The word embedding parameters. This can be used to initialize :class:`~transformers.ProphetNetEncoder` with pre-defined word embeddings instead of randomely initialized word embeddings. """ def __init__(self, config: ProphetNetConfig, word_embeddings: nn.Embedding = None): super().__init__(config) self.ngram = config.ngram self.num_buckets = config.num_buckets self.relative_max_distance = config.relative_max_distance self.dropout = config.dropout self.max_target_positions = config.max_position_embeddings self.word_embeddings = ( word_embeddings if word_embeddings is not None else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) ) self.position_embeddings = ProhpetNetPositionalEmbeddings(config) self.ngram_embeddings = nn.Embedding(self.ngram, config.hidden_size, None) self.layers = nn.ModuleList([ProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)]) self.embeddings_layer_norm = ProphetNetLayerNorm(config.hidden_size) self.init_weights() def get_input_embeddings(self): return self.word_embeddings def set_input_embeddings(self, value): self.word_embeddings = value
@add_start_docstrings_to_model_forward(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
@replace_return_docstrings(output_type=ProphetNetDecoderModelOutput, config_class=_CONFIG_FOR_DOC)
def forward(
    self,
    input_ids=None,
    attention_mask=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_values=None,
    inputs_embeds=None,
    use_cache=None,
    output_attentions=None,
    output_hidden_states=None,
    return_dict=None,
):
    r"""
    encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
        Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
        the model is configured as a decoder.
    encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
        Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
        the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

        - 1 for tokens that are **not masked**,
        - 0 for tokens that are **masked**.
    past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
        Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.

        If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
        (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
        instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
    use_cache (:obj:`bool`, `optional`):
        If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
        decoding (see :obj:`past_key_values`).

    Returns:

    Example::

        >>> from transformers import ProphetNetTokenizer, ProphetNetDecoder
        >>> import torch

        >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
        >>> model = ProphetNetDecoder.from_pretrained('patrickvonplaten/prophetnet-large-uncased-standalone', add_cross_attention=False)
        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
    """
    # Fall back to the config defaults for every flag the caller left unset.
    use_cache = use_cache if use_cache is not None else self.config.use_cache
    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # Exactly one of `input_ids` / `inputs_embeds` must be provided.
    if input_ids is None and inputs_embeds is None:
        raise ValueError("Either `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.")
    elif input_ids is not None and inputs_embeds is not None:
        raise ValueError("Make sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.")
    elif input_ids is not None and inputs_embeds is None:
        inputs_embeds = self.word_embeddings(input_ids)

    batch_size, sequence_length = inputs_embeds.shape[:2]

    main_stream_pos_embed, position_ids = self.position_embeddings(
        (batch_size, sequence_length),
        device=inputs_embeds.device,
        past_key_values=past_key_values,
    )

    # Relative position buckets are only precomputed here when no cache is used.
    if past_key_values is not None:
        main_relative_position_buckets, predict_relative_position_buckets = None, None
    else:
        (
            main_relative_position_buckets,
            predict_relative_position_buckets,
        ) = self.compute_buffered_relative_buckets(position_ids)
    # The predict streams use the embeddings of the positions shifted by one.
    predicting_stream_pos_embed = self.position_embeddings._forward(position_ids + 1)

    # add position embeddings
    hidden_states = inputs_embeds + main_stream_pos_embed
    # From here on the layout is sequence-first: (sequence_length, batch_size, ...).
    hidden_states = hidden_states.transpose(0, 1)

    ngram_embeddings = self.ngram_embeddings.weight

    # prepare attention mask
    # NOTE(review): `ngram - 1` is -1 for the first stream, i.e. it selects the last embedding row; kept unchanged.
    if past_key_values is not None:
        assert (
            hidden_states.size(0) == 1
        ), "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"

        ngram_hidden_states = [
            (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).transpose(0, 1).repeat(1, batch_size, 1)
            for ngram in range(self.ngram)
        ]
        extended_attention_mask = None
        extended_predict_attention_mask = None
    else:
        ngram_hidden_states = [
            (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).transpose(0, 1)
            for ngram in range(self.ngram)
        ]
        extended_attention_mask = self.prepare_attention_mask(hidden_states, attention_mask)
        extended_predict_attention_mask = self.prepare_predict_attention_mask(hidden_states, attention_mask)

    # prepare encoder attention mask
    if encoder_attention_mask is not None:
        # The mask is consumed by the decoder layers, so it is tiled once per *decoder* attention head,
        # consistent with `prepare_attention_mask` / `prepare_predict_attention_mask`.
        extended_encoder_attention_mask = (
            1.0 - encoder_attention_mask[:, None, :].repeat(self.config.num_decoder_attention_heads, 1, 1)
        ) * -10000.0
        extended_encoder_attention_mask = extended_encoder_attention_mask.to(inputs_embeds.dtype)
    else:
        extended_encoder_attention_mask = None

    # Stack the main stream followed by all predict streams along the sequence axis;
    # the two parts are split again below with `[:sequence_length]` / `[sequence_length:]`.
    hidden_states = torch.cat([hidden_states] + ngram_hidden_states, 0)

    if self.embeddings_layer_norm:
        hidden_states = self.embeddings_layer_norm(hidden_states)

    hidden_states = F.dropout(hidden_states, p=self.dropout, training=self.training)

    if encoder_hidden_states is not None:
        # Sequence-first layout for the layers; transposed back before returning.
        encoder_hidden_states = encoder_hidden_states.transpose(0, 1)

    # init attentions, hidden_states and cache with empty tuples
    all_main_stream_hidden_states = () if output_hidden_states else None
    all_ngram_stream_hidden_states = () if output_hidden_states and self.config.ngram > 0 else None

    all_main_stream_attns = () if output_attentions else None
    all_ngram_stream_attns = () if output_attentions else None
    all_cross_attns = () if output_attentions and self.config.add_cross_attention else None
    present_key_values = () if use_cache else None

    for idx, decoder_layer in enumerate(self.layers):
        if output_hidden_states:
            # grad cannot be kept because tensor is sliced
            all_main_stream_hidden_states += (hidden_states[:sequence_length].transpose(0, 1),)
            if self.config.ngram > 0:
                all_ngram_stream_hidden_states += (hidden_states[sequence_length:].transpose(0, 1),)

        layer_state = past_key_values[idx] if past_key_values is not None else None
        (
            hidden_states,
            layer_self_attn,
            layer_self_predict_attn_output,
            layer_cross_attn,
            layer_past,
        ) = decoder_layer(
            hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attn_mask=extended_encoder_attention_mask,
            layer_state=layer_state,
            attention_mask=extended_attention_mask,
            extended_predict_attention_mask=extended_predict_attention_mask,
            main_relative_position_buckets=main_relative_position_buckets,
            predict_relative_position_buckets=predict_relative_position_buckets,
            position_ids=position_ids,
        )
        if use_cache:
            present_key_values += (layer_past,)

        if output_attentions:
            all_main_stream_attns += (layer_self_attn,)
            all_ngram_stream_attns += (layer_self_predict_attn_output,)

            if self.config.add_cross_attention:
                all_cross_attns += (layer_cross_attn,)

    # Record the output of the last layer as well.
    if output_hidden_states:
        all_main_stream_hidden_states += (hidden_states[:sequence_length].transpose(0, 1),)
        if self.config.ngram > 0:
            all_ngram_stream_hidden_states += (hidden_states[sequence_length:].transpose(0, 1),)

    # split last_hidden_state for return
    last_hidden_state = hidden_states[:sequence_length].transpose(0, 1)
    last_hidden_state_ngram = hidden_states[sequence_length:].transpose(0, 1) if self.config.ngram > 0 else None

    encoder_hidden_states = encoder_hidden_states.transpose(0, 1) if encoder_hidden_states is not None else None

    if not return_dict:
        # Tuple output: entries that are None are dropped, so positions depend on the requested outputs.
        return tuple(
            v
            for v in [
                last_hidden_state,
                last_hidden_state_ngram,
                present_key_values,
                all_main_stream_hidden_states,
                all_ngram_stream_hidden_states,
                all_main_stream_attns,
                all_ngram_stream_attns,
                all_cross_attns,
            ]
            if v is not None
        )
    return ProphetNetDecoderModelOutput(
        last_hidden_state=last_hidden_state,
        last_hidden_state_ngram=last_hidden_state_ngram,
        past_key_values=present_key_values,
        hidden_states=all_main_stream_hidden_states,
        hidden_states_ngram=all_ngram_stream_hidden_states,
        attentions=all_main_stream_attns,
        ngram_attentions=all_ngram_stream_attns,
        cross_attentions=all_cross_attns,
    )
def compute_buffered_relative_buckets(self, position_ids):
    """
    Build the relative position buckets of the main stream and of the predict streams.

    Buckets are computed once for positions ``1 .. self.max_target_positions - 1`` and then cut down to the
    sequence length of :obj:`position_ids` and tiled :obj:`batch_size` times along the first axis.

    Args:
        position_ids: 2D tensor; only its shape ``(batch_size, sequence_length)`` and its device are used.

    Returns:
        Tuple ``(main_relative_buckets, predict_relative_buckets)``.
    """
    batch_size, sequence_length = position_ids.shape
    offset = self.max_target_positions

    all_positions = torch.arange(1, offset).to(position_ids.device).repeat(1, 1)
    main_buckets, predict_buckets = compute_all_stream_relative_buckets(
        self.num_buckets, self.relative_max_distance, all_positions
    )

    # buffer relative buckets
    main_buckets = main_buckets[:, :sequence_length, :sequence_length]
    main_buckets = main_buckets.repeat(batch_size, 1, 1)

    # The predict buckets keep two column windows: the first `sequence_length` columns and the
    # `sequence_length` columns starting at `max_target_positions`.
    leading_columns = predict_buckets[:, :sequence_length, :sequence_length]
    shifted_columns = predict_buckets[:, :sequence_length, offset : offset + sequence_length]
    predict_buckets = torch.cat([leading_columns, shifted_columns], 2).repeat(batch_size, 1, 1)

    return main_buckets, predict_buckets


def prepare_attention_mask(self, hidden_states, attention_mask):
    """
    Build the additive main-stream attention mask (causal mask plus optional padding mask).

    Args:
        hidden_states: tensor whose first two dimensions are ``(seq_length, batch_size)``.
        attention_mask: optional ``(batch_size, seq_length)`` mask with 1 for kept and 0 for masked positions.

    Returns:
        Tensor of shape ``(num_decoder_attention_heads * batch_size, seq_length, seq_length)`` in the dtype of
        :obj:`hidden_states`.
    """
    seq_length, batch_size = hidden_states.shape[:2]
    num_heads = self.config.num_decoder_attention_heads

    # get causal mask: -inf strictly above the diagonal, 0 elsewhere
    causal_mask = torch.triu(
        torch.full(
            (seq_length, seq_length), -float("inf"), dtype=torch.float32, device=hidden_states.device
        ),
        1,
    )
    combined_mask = causal_mask[None, :, :].expand((batch_size,) + causal_mask.shape)

    # add usual attention mask
    if attention_mask is not None:
        padding_mask = (1.0 - attention_mask[:, None, :]) * -10000.0
        combined_mask = combined_mask + padding_mask

    return combined_mask.repeat(num_heads, 1, 1).to(hidden_states.dtype)


def prepare_predict_attention_mask(self, hidden_states, attention_mask):
    """
    Build the additive attention mask of the predict (n-gram) streams.

    Args:
        hidden_states: tensor whose first two dimensions are ``(seq_length, batch_size)``.
        attention_mask: optional ``(batch_size, seq_length)`` mask with 1 for kept and 0 for masked positions.

    Returns:
        4D tensor in the dtype of :obj:`hidden_states`; the second axis is
        ``batch_size * num_decoder_attention_heads`` and the last axis is ``2 * seq_length``.
    """
    seq_length, batch_size = hidden_states.shape[:2]
    offset = self.max_target_positions

    # get causal mask
    full_bias = ngram_attention_bias(offset, self.ngram, hidden_states.device, hidden_states.dtype)
    leading_columns = full_bias[:, :seq_length, :seq_length]
    shifted_columns = full_bias[:, :seq_length, offset : offset + seq_length]
    predict_causal_mask = torch.cat([leading_columns, shifted_columns], dim=-1)

    # Insert a batch axis right after the leading axis.
    target_shape = predict_causal_mask.shape[:1] + (batch_size,) + predict_causal_mask.shape[1:]
    combined_mask = predict_causal_mask[:, None, :, :].expand(target_shape)

    # add usual attention mask
    if attention_mask is not None:
        padding_mask = (1.0 - attention_mask[None, :, None, :]) * -10000.0
        padding_mask = padding_mask.expand((self.ngram, batch_size, seq_length, seq_length))
        # predicted stream attention_mask should always be 0
        padding_mask = torch.cat([padding_mask, torch.zeros_like(padding_mask)], dim=-1)
        combined_mask = combined_mask + padding_mask

    return combined_mask.repeat(1, self.config.num_decoder_attention_heads, 1, 1).to(hidden_states.dtype)
# Encoder-decoder ProphetNet without a head: a shared word embedding feeds both a `ProphetNetEncoder`
# and a `ProphetNetDecoder`, each built from its own deep copy of the config.
@add_start_docstrings(
    "The bare ProphetNet Model outputting raw hidden-states without any specific head on top.",
    PROPHETNET_START_DOCSTRING,
)
class ProphetNetModel(ProphetNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)

        # The encoder never caches and is not itself an encoder-decoder.
        encoder_config = copy.deepcopy(config)
        encoder_config.is_encoder_decoder = False
        encoder_config.use_cache = False
        self.encoder = ProphetNetEncoder(encoder_config, self.word_embeddings)

        decoder_config = copy.deepcopy(config)
        decoder_config.is_decoder = True
        decoder_config.is_encoder_decoder = False
        self.decoder = ProphetNetDecoder(decoder_config, self.word_embeddings)

        self.init_weights()

    def get_input_embeddings(self):
        """Return the word embedding module shared by encoder and decoder."""
        return self.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the shared word embedding module on the model, the encoder and the decoder."""
        self.word_embeddings = value
        self.encoder.word_embeddings = self.word_embeddings
        self.decoder.word_embeddings = self.word_embeddings

    def get_encoder(self):
        """Return the encoder sub-module."""
        return self.encoder

    def get_decoder(self):
        """Return the decoder sub-module."""
        return self.decoder

    @add_start_docstrings_to_model_forward(PROPHETNET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ProphetNetSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        encoder_outputs: Optional[Tuple] = None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        Returns:

        Example::

            >>> from transformers import ProphetNetTokenizer, ProphetNetModel

            >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
            >>> model = ProphetNetModel.from_pretrained('microsoft/prophetnet-large-uncased')

            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

            >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
            >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
        """
        # Resolve unset flags from the config. (`use_cache` was previously compared with `==`
        # instead of being assigned, which made the statement a no-op.)
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Run the encoder only when the caller did not supply precomputed encoder outputs.
        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_ids=input_ids,
                attention_mask=attention_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        decoder_outputs = self.decoder(
            input_ids=decoder_input_ids,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            # The encoder input mask doubles as the cross-attention mask.
            encoder_attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=decoder_inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            use_cache=use_cache,
            return_dict=return_dict,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs
        return ProphetNetSeq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            last_hidden_state_ngram=decoder_outputs.last_hidden_state_ngram,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_ngram_hidden_states=decoder_outputs.hidden_states_ngram,
            decoder_attentions=decoder_outputs.attentions,
            decoder_ngram_attentions=decoder_outputs.ngram_attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )
# Sequence-to-sequence ProphetNet: wraps `ProphetNetModel` and projects every predict stream to
# vocabulary logits with a bias-free linear `lm_head`.
@add_start_docstrings(
    "The ProphetNet Model with a language modeling head. Can be used for sequence generation tasks.",
    PROPHETNET_START_DOCSTRING,
)
class ProphetNetForConditionalGeneration(ProphetNetPreTrainedModel):
    def __init__(self, config: ProphetNetConfig):
        super().__init__(config)
        self.prophetnet = ProphetNetModel(config)
        self.padding_idx = config.pad_token_id
        self.disable_ngram_loss = config.disable_ngram_loss

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.init_weights()

    def get_output_embeddings(self):
        """Return the language modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language modeling head."""
        self.lm_head = new_embeddings

    def get_input_embeddings(self):
        """Return the word embedding module of the wrapped :obj:`ProphetNetModel`."""
        return self.prophetnet.word_embeddings

    @add_start_docstrings_to_model_forward(PROPHETNET_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ProphetNetSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        decoder_input_ids=None,
        decoder_attention_mask=None,
        encoder_outputs=None,
        past_key_values=None,
        inputs_embeds=None,
        decoder_inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the language modeling loss. Indices should be in :obj:`[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to ``-100`` are ignored (masked), the loss is only computed for
            labels in ``[0, ..., config.vocab_size - 1]``

        Returns:

        Example::

            >>> from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration

            >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
            >>> model = ProphetNetForConditionalGeneration.from_pretrained('microsoft/prophetnet-large-uncased')

            >>> input_ids = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt").input_ids  # Batch size 1
            >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
            >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)

            >>> logits_next_token = outputs.logits  # logits to predict next token as usual
            >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
            # get decoder inputs from shifting lm labels to the right
            decoder_input_ids = self._shift_right(labels)

        outputs = self.prophetnet(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            decoder_inputs_embeds=decoder_inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        batch_size, sequence_length = (
            decoder_input_ids.shape if decoder_input_ids is not None else decoder_inputs_embeds.shape[:2]
        )

        # outputs[1] holds the predict-stream hidden states; unfold them to
        # (batch_size, ngram, sequence_length, hidden_size) before projecting.
        predicting_streams = outputs[1].view(batch_size, self.config.ngram, sequence_length, -1)
        predict_logits = self.lm_head(predicting_streams)

        # Stream 0 yields the next-token logits, the remaining streams the further-ahead tokens.
        logits = predict_logits[:, 0]
        logits_ngram = predict_logits[:, 1:] if self.config.ngram > 1 else None

        # To use .view in loss computation, make sure that logits is contiguous.
        if not logits.is_contiguous():
            logits = logits.contiguous()

        loss = None
        if labels is not None:
            loss = self._compute_loss(predict_logits, labels)

        if not return_dict:
            all_logits = tuple(v for v in [logits, logits_ngram] if v is not None)
            return (loss,) + all_logits + outputs[2:] if loss is not None else all_logits + outputs[2:]
        else:
            return ProphetNetSeq2SeqLMOutput(
                loss=loss,
                logits=logits,
                logits_ngram=logits_ngram,
                past_key_values=outputs.past_key_values,
                decoder_hidden_states=outputs.decoder_hidden_states,
                decoder_ngram_hidden_states=outputs.decoder_ngram_hidden_states,
                decoder_attentions=outputs.decoder_attentions,
                decoder_ngram_attentions=outputs.decoder_ngram_attentions,
                cross_attentions=outputs.cross_attentions,
                encoder_last_hidden_state=outputs.encoder_last_hidden_state,
                encoder_hidden_states=outputs.encoder_hidden_states,
                encoder_attentions=outputs.encoder_attentions,
            )

    def _compute_loss(self, logits, labels, ignore_index=-100):
        """
        Compute the (optionally label-smoothed) negative log-likelihood over all predict streams.

        Args:
            logits: tensor of shape ``(batch_size, ngram, sequence_length, vocab_size)``.
            labels: tensor of shape ``(batch_size, sequence_length)``.
            ignore_index: label value that is excluded from the loss.

        Returns:
            Scalar loss tensor.
        """
        # Targets are laid out as (ngram, batch_size, sequence_length); streams that do not take part
        # in the loss keep `ignore_index` everywhere.
        expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)

        for i in range(self.config.ngram):
            if i > 0 and self.disable_ngram_loss:
                break
            expend_targets[i, :, :] = labels

        # Bring the logits into the same (ngram, batch_size, ...) order as the targets before
        # flattening; otherwise rows are paired with the wrong targets when batch_size > 1.
        logits = logits.transpose(0, 1).contiguous()
        lprobs = F.log_softmax(
            logits.view(-1, logits.size(-1)),
            dim=-1,
            dtype=torch.float32,
        )

        # Forward `ignore_index` so that non-default values are honoured as well.
        loss = F.nll_loss(lprobs, expend_targets.view(-1), ignore_index=ignore_index, reduction="mean")

        if self.config.eps > 0.0:
            # Label smoothing: mix in the mean negative log-probability mass over the vocabulary,
            # restricted to non-ignored positions.
            smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
            non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
            smooth_loss = smooth_loss[non_masked_tokens]
            smooth_loss = smooth_loss.mean()

            eps_i = self.config.eps / lprobs.size(-1)
            loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss

        return loss

    def prepare_inputs_for_generation(
        self, decoder_input_ids, past=None, attention_mask=None, use_cache=None, encoder_outputs=None, **kwargs
    ):
        """Assemble the keyword arguments of :meth:`forward` for one generation step."""
        assert encoder_outputs is not None, "`encoder_outputs` have to be passed for generation."

        # With a cache only the newest decoder token has to be fed.
        if past:
            decoder_input_ids = decoder_input_ids[:, -1:]
        # first step, decoder_cached_states are empty
        return {
            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
            "encoder_outputs": encoder_outputs,
            "past_key_values": past,
            "decoder_input_ids": decoder_input_ids,
            "attention_mask": attention_mask,
            "use_cache": use_cache,
        }

    @staticmethod
    def _reorder_cache(past, beam_idx):
        """Reorder every cached tensor along its first dimension according to :obj:`beam_idx`."""
        # this function reorders the cache for beam search
        def _reorder_cache(cache_dict, beam_idx):
            for k, key_value_states in cache_dict.items():
                if key_value_states is not None:
                    cache_dict[k] = key_value_states.index_select(0, beam_idx)
            return cache_dict

        reordered_past = []
        for layer_past in past:
            # get the correct batch idx from decoder layer's batch dim for cross and self-attn
            layer_past_new = {
                attn_key: _reorder_cache(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()
            }
            reordered_past.append(layer_past_new)
        return reordered_past

    def get_encoder(self):
        """Return the encoder of the wrapped :obj:`ProphetNetModel`."""
        return self.prophetnet.encoder

    def get_decoder(self):
        """Return the decoder of the wrapped :obj:`ProphetNetModel`."""
        return self.prophetnet.decoder
# Decoder-only ProphetNet with a bias-free linear `lm_head`; the decoder lives inside a
# `ProphetNetDecoderWrapper` stored as `self.prophetnet`.
@add_start_docstrings(
    "The standalone decoder part of the ProphetNetModel with a lm head on top. The model can be used for causal language modeling.",
    PROPHETNET_START_DOCSTRING,
)
class ProphetNetForCausalLM(ProphetNetPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        # set config for CLM
        config = copy.deepcopy(config)
        config.is_decoder = True
        config.is_encoder_decoder = False
        self.prophetnet = ProphetNetDecoderWrapper(config)

        self.padding_idx = config.pad_token_id
        self.disable_ngram_loss = config.disable_ngram_loss

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self.init_weights()

    def get_input_embeddings(self):
        """Return the decoder's word embedding module."""
        return self.prophetnet.decoder.word_embeddings

    def set_input_embeddings(self, value):
        """Replace the decoder's word embedding module."""
        self.prophetnet.decoder.word_embeddings = value

    def get_output_embeddings(self):
        """Return the language modeling head."""
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        """Replace the language modeling head."""
        self.lm_head = new_embeddings

    def set_decoder(self, decoder):
        """Replace the wrapped decoder."""
        self.prophetnet.decoder = decoder

    def get_decoder(self):
        """Return the wrapped decoder."""
        return self.prophetnet.decoder

    @add_start_docstrings_to_model_forward(PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
    @replace_return_docstrings(output_type=ProphetNetDecoderLMOutput, config_class=_CONFIG_FOR_DOC)
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_values=None,
        inputs_embeds=None,
        labels=None,
        use_cache=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
    ):
        r"""
        encoder_hidden_states (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`, `optional`):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
            the model is configured as a decoder.
        encoder_attention_mask (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
            the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
        past_key_values (:obj:`tuple(tuple(torch.FloatTensor))` of length :obj:`config.n_layers` with each tuple having 4 tensors of shape :obj:`(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
            Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.

            If :obj:`past_key_values` are used, the user can optionally input only the last ``decoder_input_ids``
            (those that don't have their past key value states given to this model) of shape :obj:`(batch_size, 1)`
            instead of all ``decoder_input_ids`` of shape :obj:`(batch_size, sequence_length)`.
        use_cache (:obj:`bool`, `optional`):
            If set to :obj:`True`, :obj:`past_key_values` key value states are returned and can be used to speed up
            decoding (see :obj:`past_key_values`).
        labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are
            ignored (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``

        Returns:

        Example::

            >>> from transformers import ProphetNetTokenizer, ProphetNetForCausalLM
            >>> import torch

            >>> tokenizer = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
            >>> model = ProphetNetForCausalLM.from_pretrained('microsoft/prophetnet-large-uncased')
            >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
            >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
            >>> outputs = model(**inputs)

            >>> logits = outputs.logits

            >>> # Model can also be used with EncoderDecoder framework
            >>> from transformers import BertTokenizer, EncoderDecoderModel, ProphetNetTokenizer
            >>> import torch

            >>> tokenizer_enc = BertTokenizer.from_pretrained('bert-large-uncased')
            >>> tokenizer_dec = ProphetNetTokenizer.from_pretrained('microsoft/prophetnet-large-uncased')
            >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-large-uncased", "microsoft/prophetnet-large-uncased")

            >>> ARTICLE = (
            ... "the us state department said wednesday it had received no "
            ... "formal word from bolivia that it was expelling the us ambassador there "
            ... "but said the charges made against him are `` baseless ."
            ... )
            >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
            >>> labels = tokenizer_dec("us rejects charges against its ambassador in bolivia", return_tensors="pt").input_ids
            >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])

            >>> loss = outputs.loss
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
        outputs = self.prophetnet.decoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        batch_size, sequence_length = input_ids.shape if input_ids is not None else inputs_embeds.shape[:2]

        # outputs[1] holds the predict-stream hidden states; unfold them to
        # (batch_size, ngram, sequence_length, hidden_size) before projecting.
        predicting_streams = outputs[1].view(batch_size, self.config.ngram, sequence_length, -1)
        predict_logits = self.lm_head(predicting_streams)

        # Stream 0 yields the next-token logits, the remaining streams the further-ahead tokens.
        logits = predict_logits[:, 0]
        logits_ngram = predict_logits[:, 1:] if self.config.ngram > 1 else None

        loss = None
        if labels is not None:
            loss = self._compute_loss(predict_logits, labels)

        if not return_dict:
            all_logits = tuple(v for v in [logits, logits_ngram] if v is not None)
            return (loss,) + all_logits + outputs[2:] if loss is not None else all_logits + outputs[2:]
        else:
            return ProphetNetDecoderLMOutput(
                loss=loss,
                logits=logits,
                logits_ngram=logits_ngram,
                past_key_values=outputs.past_key_values,
                hidden_states=outputs.hidden_states,
                hidden_states_ngram=outputs.hidden_states_ngram,
                attentions=outputs.attentions,
                ngram_attentions=outputs.ngram_attentions,
                cross_attentions=outputs.cross_attentions,
            )

    def _compute_loss(self, logits, labels, ignore_index=-100):
        """
        Compute the (optionally label-smoothed) negative log-likelihood over all predict streams.

        Args:
            logits: tensor of shape ``(batch_size, ngram, sequence_length, vocab_size)``.
            labels: tensor of shape ``(batch_size, sequence_length)``.
            ignore_index: label value that is excluded from the loss.

        Returns:
            Scalar loss tensor.
        """
        # Targets are laid out as (ngram, batch_size, sequence_length); streams that do not take part
        # in the loss keep `ignore_index` everywhere.
        expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)

        for i in range(self.config.ngram):
            if i > 0 and self.disable_ngram_loss:
                break
            expend_targets[i, :, :] = labels

        # Bring the logits into the same (ngram, batch_size, ...) order as the targets before
        # flattening; otherwise rows are paired with the wrong targets when batch_size > 1.
        logits = logits.transpose(0, 1).contiguous()
        lprobs = F.log_softmax(
            logits.view(-1, logits.size(-1)),
            dim=-1,
            dtype=torch.float32,
        )

        # Forward `ignore_index` so that non-default values are honoured as well.
        loss = F.nll_loss(lprobs, expend_targets.view(-1), ignore_index=ignore_index, reduction="mean")

        if self.config.eps > 0.0:
            # Label smoothing: mix in the mean negative log-probability mass over the vocabulary,
            # restricted to non-ignored positions.
            smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
            non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
            smooth_loss = smooth_loss[non_masked_tokens]
            smooth_loss = smooth_loss.mean()

            eps_i = self.config.eps / lprobs.size(-1)
            loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss

        return loss

    def prepare_inputs_for_generation(self, input_ids, past=None, attention_mask=None, use_cache=None, **kwargs):
        """Assemble the keyword arguments of :meth:`forward` for one generation step."""
        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
        if attention_mask is None:
            attention_mask = input_ids.new_ones(input_ids.shape)

        # With a cache only the newest token has to be fed.
        if past:
            input_ids = input_ids[:, -1:]
        # first step, decoder_cached_states are empty
        return {
            "input_ids": input_ids,  # encoder_outputs is defined. input_ids not needed
            "attention_mask": attention_mask,
            "past_key_values": past,
            "use_cache": use_cache,
        }

    @staticmethod
    def _reorder_cache(past, beam_idx):
        """Reorder every cached tensor along its first dimension according to :obj:`beam_idx`."""
        # this function reorders the cache for beam search
        def _reorder_cache(cache_dict, beam_idx):
            for k, key_value_states in cache_dict.items():
                if key_value_states is not None:
                    cache_dict[k] = key_value_states.index_select(0, beam_idx)
            return cache_dict

        reordered_past = []
        for layer_past in past:
            # get the correct batch idx from decoder layer's batch dim for cross and self-attn
            layer_past_new = {
                attn_key: _reorder_cache(attn_cache, beam_idx) for attn_key, attn_cache in layer_past.items()
            }
            reordered_past.append(layer_past_new)
        return reordered_past
class ProphetNetDecoderWrapper(ProphetNetPreTrainedModel):
    """
    Thin container that holds a :class:`ProphetNetDecoder` under the attribute name ``decoder``, so that
    :class:`~transformers.ProphetNetForCausalLM` can correctly be loaded from pretrained prophetnet classes.
    """

    def __init__(self, config):
        super().__init__(config)
        # The decoder is built from the very same config object that was handed to the wrapper.
        wrapped_decoder = ProphetNetDecoder(config)
        self.decoder = wrapped_decoder

    def forward(self, *args, **kwargs):
        """Delegate the call, with all arguments untouched, to the wrapped decoder."""
        decoder_outputs = self.decoder(*args, **kwargs)
        return decoder_outputs