tuan.ljn committed
Commit c16d438
Parent: d601671

update config

Files changed (2)
  1. config.py +150 -0
  2. modeling.py +1567 -0
config.py ADDED
@@ -0,0 +1,150 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Bilingual configuration"""
+
+from collections import OrderedDict
+from typing import Mapping
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.onnx import OnnxConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class BilingualConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`BilingualModel`] or a [`TFBilingualModel`]. It
+    is used to instantiate a Bilingual model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Bilingual model.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the Bilingual model. Defines the number of different tokens that can be represented by
+            the `input_ids` passed when calling [`BilingualModel`] or [`TFBilingualModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`BilingualModel`] or
+            [`TFBilingualModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
+            Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
+            positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
+            [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
+            For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
+            with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Examples:
+
+    ```python
+    >>> from transformers import BilingualConfig, BilingualModel
+
+    >>> configuration = BilingualConfig()
+
+    >>> model = BilingualModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "bilingual"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        position_embedding_type="absolute",
+        use_cache=True,
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.position_embedding_type = position_embedding_type
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+
+
+class BilingualOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
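For reference, a minimal sketch of how the two classes added above would typically be exercised once config.py is on the Python path; the flat `config` import path and the `"default"` ONNX task name are assumptions for illustration, not something this commit defines:

```python
# Hypothetical smoke test for config.py (not part of the commit).
from config import BilingualConfig, BilingualOnnxConfig

# A configuration built with the documented defaults.
config = BilingualConfig()
print(config.model_type, config.hidden_size, config.vocab_size)  # bilingual 768 30522

# The ONNX config declares which input axes are dynamic at export time.
onnx_config = BilingualOnnxConfig(config, task="default")
print(onnx_config.inputs)
# OrderedDict([('input_ids', {0: 'batch', 1: 'sequence'}),
#              ('attention_mask', {0: 'batch', 1: 'sequence'})])
```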
modeling.py ADDED
@@ -0,0 +1,1567 @@
1
+ import math
2
+ from typing import List, Optional, Tuple, Union
3
+
4
+ import torch
5
+ import torch.utils.checkpoint
6
+ from torch import nn
7
+ from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
8
+
9
+ from transformers.activations import ACT2FN, gelu
10
+ from transformers.modeling_outputs import (
11
+ BaseModelOutputWithPastAndCrossAttentions,
12
+ BaseModelOutputWithPoolingAndCrossAttentions,
13
+ CausalLMOutputWithCrossAttentions,
14
+ MaskedLMOutput,
15
+ MultipleChoiceModelOutput,
16
+ NextSentencePredictorOutput,
17
+ QuestionAnsweringModelOutput,
18
+ SequenceClassifierOutput,
19
+ TokenClassifierOutput,
20
+ )
21
+ from transformers.pytorch_utils import (
22
+ apply_chunking_to_forward,
23
+ find_pruneable_heads_and_indices,
24
+ prune_linear_layer,
25
+ )
26
+
27
+ from transformers.modeling_utils import PreTrainedModel
28
+ from transformers.utils import (
29
+ ModelOutput,
30
+ add_code_sample_docstrings,
31
+ add_start_docstrings,
32
+ add_start_docstrings_to_model_forward,
33
+ logging,
34
+ replace_return_docstrings,
35
+ )
36
+
37
+ from .config import BilingualConfig
38
+
39
+
40
+ logger = logging.get_logger(__name__)
41
+
42
+ _CHECKPOINT_FOR_DOC = "FacebookAI/xlm-roberta-base"
43
+ _CONFIG_FOR_DOC = "BilingualConfig"
44
+
45
+
46
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Bilingual
47
+ class BilingualEmbeddings(nn.Module):
48
+ """
49
+ Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
50
+ """
51
+
52
+ # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
53
+ def __init__(self, config):
54
+ super().__init__()
55
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
56
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
57
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
58
+
59
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
60
+ # any TensorFlow checkpoint file
61
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
62
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
63
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
64
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
65
+ self.register_buffer(
66
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
67
+ )
68
+ self.register_buffer(
69
+ "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
70
+ )
71
+
72
+ # End copy
73
+ self.padding_idx = config.pad_token_id
74
+ self.position_embeddings = nn.Embedding(
75
+ config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx
76
+ )
77
+
78
+ def forward(
79
+ self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
80
+ ):
81
+ if position_ids is None:
82
+ if input_ids is not None:
83
+ # Create the position ids from the input token ids. Any padded tokens remain padded.
84
+ position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
85
+ else:
86
+ position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
87
+
88
+ if input_ids is not None:
89
+ input_shape = input_ids.size()
90
+ else:
91
+ input_shape = inputs_embeds.size()[:-1]
92
+
93
+ seq_length = input_shape[1]
94
+
95
+ # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
96
+ when it is auto-generated; the registered buffer helps users when tracing the model without passing token_type_ids and solves
97
+ # issue #5664
98
+ if token_type_ids is None:
99
+ if hasattr(self, "token_type_ids"):
100
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
101
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
102
+ token_type_ids = buffered_token_type_ids_expanded
103
+ else:
104
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
105
+
106
+ if inputs_embeds is None:
107
+ inputs_embeds = self.word_embeddings(input_ids)
108
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
109
+
110
+ embeddings = inputs_embeds + token_type_embeddings
111
+ if self.position_embedding_type == "absolute":
112
+ position_embeddings = self.position_embeddings(position_ids)
113
+ embeddings += position_embeddings
114
+ embeddings = self.LayerNorm(embeddings)
115
+ embeddings = self.dropout(embeddings)
116
+ return embeddings
117
+
118
+ def create_position_ids_from_inputs_embeds(self, inputs_embeds):
119
+ """
120
+ We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
121
+
122
+ Args:
123
+ inputs_embeds: torch.Tensor
124
+
125
+ Returns: torch.Tensor
126
+ """
127
+ input_shape = inputs_embeds.size()[:-1]
128
+ sequence_length = input_shape[1]
129
+
130
+ position_ids = torch.arange(
131
+ self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
132
+ )
133
+ return position_ids.unsqueeze(0).expand(input_shape)
134
+
135
+
136
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->Bilingual
137
+ class BilingualSelfAttention(nn.Module):
138
+ def __init__(self, config, position_embedding_type=None):
139
+ super().__init__()
140
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
141
+ raise ValueError(
142
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
143
+ f"heads ({config.num_attention_heads})"
144
+ )
145
+
146
+ self.num_attention_heads = config.num_attention_heads
147
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
148
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
149
+
150
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
151
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
152
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
153
+
154
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
155
+ self.position_embedding_type = position_embedding_type or getattr(
156
+ config, "position_embedding_type", "absolute"
157
+ )
158
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
159
+ self.max_position_embeddings = config.max_position_embeddings
160
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
161
+
162
+ self.is_decoder = config.is_decoder
163
+
164
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
165
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
166
+ x = x.view(new_x_shape)
167
+ return x.permute(0, 2, 1, 3)
168
+
169
+ def forward(
170
+ self,
171
+ hidden_states: torch.Tensor,
172
+ attention_mask: Optional[torch.FloatTensor] = None,
173
+ head_mask: Optional[torch.FloatTensor] = None,
174
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
175
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
176
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
177
+ output_attentions: Optional[bool] = False,
178
+ ) -> Tuple[torch.Tensor]:
179
+ mixed_query_layer = self.query(hidden_states)
180
+
181
+ # If this is instantiated as a cross-attention module, the keys
182
+ # and values come from an encoder; the attention mask needs to be
183
+ # such that the encoder's padding tokens are not attended to.
184
+ is_cross_attention = encoder_hidden_states is not None
185
+
186
+ if is_cross_attention and past_key_value is not None:
187
+ # reuse k,v, cross_attentions
188
+ key_layer = past_key_value[0]
189
+ value_layer = past_key_value[1]
190
+ attention_mask = encoder_attention_mask
191
+ elif is_cross_attention:
192
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
193
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
194
+ attention_mask = encoder_attention_mask
195
+ elif past_key_value is not None:
196
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
197
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
198
+ key_layer = torch.cat([past_key_value[0], key_layer], dim=2)
199
+ value_layer = torch.cat([past_key_value[1], value_layer], dim=2)
200
+ else:
201
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
202
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
203
+
204
+ query_layer = self.transpose_for_scores(mixed_query_layer)
205
+
206
+ use_cache = past_key_value is not None
207
+ if self.is_decoder:
208
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
209
+ # Further calls to cross_attention layer can then reuse all cross-attention
210
+ # key/value_states (first "if" case)
211
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
212
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
213
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
214
+ # if encoder bi-directional self-attention `past_key_value` is always `None`
215
+ past_key_value = (key_layer, value_layer)
216
+
217
+ # Take the dot product between "query" and "key" to get the raw attention scores.
218
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
219
+
220
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
221
+ query_length, key_length = query_layer.shape[2], key_layer.shape[2]
222
+ if use_cache:
223
+ position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
224
+ -1, 1
225
+ )
226
+ else:
227
+ position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
228
+ position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
229
+ distance = position_ids_l - position_ids_r
230
+
231
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
232
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
233
+
234
+ if self.position_embedding_type == "relative_key":
235
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
236
+ attention_scores = attention_scores + relative_position_scores
237
+ elif self.position_embedding_type == "relative_key_query":
238
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
239
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
240
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
241
+
242
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
243
+ if attention_mask is not None:
244
+ # Apply the attention mask (precomputed for all layers in the BilingualModel forward() function)
245
+ attention_scores = attention_scores + attention_mask
246
+
247
+ # Normalize the attention scores to probabilities.
248
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
249
+
250
+ # This is actually dropping out entire tokens to attend to, which might
251
+ # seem a bit unusual, but is taken from the original Transformer paper.
252
+ attention_probs = self.dropout(attention_probs)
253
+
254
+ # Mask heads if we want to
255
+ if head_mask is not None:
256
+ attention_probs = attention_probs * head_mask
257
+
258
+ context_layer = torch.matmul(attention_probs, value_layer)
259
+
260
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
261
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
262
+ context_layer = context_layer.view(new_context_layer_shape)
263
+
264
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
265
+
266
+ if self.is_decoder:
267
+ outputs = outputs + (past_key_value,)
268
+ return outputs
269
+
270
+
271
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput with Roberta->Bilingual
272
+ class BilingualSelfOutput(nn.Module):
273
+ def __init__(self, config):
274
+ super().__init__()
275
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
276
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
277
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
278
+
279
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
280
+ hidden_states = self.dense(hidden_states)
281
+ hidden_states = self.dropout(hidden_states)
282
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
283
+ return hidden_states
284
+
285
+
286
+ BILINGUAL_SELF_ATTENTION_CLASSES = {
287
+ "eager": BilingualSelfAttention,
288
+ }
289
+
290
+
291
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->Bilingual,ROBERTA->BILINGUAL
292
+ class BilingualAttention(nn.Module):
293
+ def __init__(self, config, position_embedding_type=None):
294
+ super().__init__()
295
+ self.self = BILINGUAL_SELF_ATTENTION_CLASSES[config._attn_implementation](
296
+ config, position_embedding_type=position_embedding_type
297
+ )
298
+ self.output = BilingualSelfOutput(config)
299
+ self.pruned_heads = set()
300
+
301
+ def prune_heads(self, heads):
302
+ if len(heads) == 0:
303
+ return
304
+ heads, index = find_pruneable_heads_and_indices(
305
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
306
+ )
307
+
308
+ # Prune linear layers
309
+ self.self.query = prune_linear_layer(self.self.query, index)
310
+ self.self.key = prune_linear_layer(self.self.key, index)
311
+ self.self.value = prune_linear_layer(self.self.value, index)
312
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
313
+
314
+ # Update hyper params and store pruned heads
315
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
316
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
317
+ self.pruned_heads = self.pruned_heads.union(heads)
318
+
319
+ def forward(
320
+ self,
321
+ hidden_states: torch.Tensor,
322
+ attention_mask: Optional[torch.FloatTensor] = None,
323
+ head_mask: Optional[torch.FloatTensor] = None,
324
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
325
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
326
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
327
+ output_attentions: Optional[bool] = False,
328
+ ) -> Tuple[torch.Tensor]:
329
+ self_outputs = self.self(
330
+ hidden_states,
331
+ attention_mask,
332
+ head_mask,
333
+ encoder_hidden_states,
334
+ encoder_attention_mask,
335
+ past_key_value,
336
+ output_attentions,
337
+ )
338
+ attention_output = self.output(self_outputs[0], hidden_states)
339
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
340
+ return outputs
341
+
342
+
343
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaIntermediate with Roberta->Bilingual
344
+ class BilingualIntermediate(nn.Module):
345
+ def __init__(self, config):
346
+ super().__init__()
347
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
348
+ if isinstance(config.hidden_act, str):
349
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
350
+ else:
351
+ self.intermediate_act_fn = config.hidden_act
352
+
353
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
354
+ hidden_states = self.dense(hidden_states)
355
+ hidden_states = self.intermediate_act_fn(hidden_states)
356
+ return hidden_states
357
+
358
+
359
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaOutput with Roberta->Bilingual
360
+ class BilingualOutput(nn.Module):
361
+ def __init__(self, config):
362
+ super().__init__()
363
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
364
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
365
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
366
+
367
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
368
+ hidden_states = self.dense(hidden_states)
369
+ hidden_states = self.dropout(hidden_states)
370
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
371
+ return hidden_states
372
+
373
+
374
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->Bilingual
375
+ class BilingualLayer(nn.Module):
376
+ def __init__(self, config):
377
+ super().__init__()
378
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
379
+ self.seq_len_dim = 1
380
+ self.attention = BilingualAttention(config)
381
+ self.is_decoder = config.is_decoder
382
+ self.add_cross_attention = config.add_cross_attention
383
+ if self.add_cross_attention:
384
+ if not self.is_decoder:
385
+ raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
386
+ self.crossattention = BilingualAttention(config, position_embedding_type="absolute")
387
+ self.intermediate = BilingualIntermediate(config)
388
+ self.output = BilingualOutput(config)
389
+
390
+ def forward(
391
+ self,
392
+ hidden_states: torch.Tensor,
393
+ attention_mask: Optional[torch.FloatTensor] = None,
394
+ head_mask: Optional[torch.FloatTensor] = None,
395
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
396
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
397
+ past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
398
+ output_attentions: Optional[bool] = False,
399
+ ) -> Tuple[torch.Tensor]:
400
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
401
+ self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
402
+ self_attention_outputs = self.attention(
403
+ hidden_states,
404
+ attention_mask,
405
+ head_mask,
406
+ output_attentions=output_attentions,
407
+ past_key_value=self_attn_past_key_value,
408
+ )
409
+ attention_output = self_attention_outputs[0]
410
+
411
+ # if decoder, the last output is tuple of self-attn cache
412
+ if self.is_decoder:
413
+ outputs = self_attention_outputs[1:-1]
414
+ present_key_value = self_attention_outputs[-1]
415
+ else:
416
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
417
+
418
+ cross_attn_present_key_value = None
419
+ if self.is_decoder and encoder_hidden_states is not None:
420
+ if not hasattr(self, "crossattention"):
421
+ raise ValueError(
422
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
423
+ " by setting `config.add_cross_attention=True`"
424
+ )
425
+
426
+ # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
427
+ cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
428
+ cross_attention_outputs = self.crossattention(
429
+ attention_output,
430
+ attention_mask,
431
+ head_mask,
432
+ encoder_hidden_states,
433
+ encoder_attention_mask,
434
+ cross_attn_past_key_value,
435
+ output_attentions,
436
+ )
437
+ attention_output = cross_attention_outputs[0]
438
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
439
+
440
+ # add cross-attn cache to positions 3,4 of present_key_value tuple
441
+ cross_attn_present_key_value = cross_attention_outputs[-1]
442
+ present_key_value = present_key_value + cross_attn_present_key_value
443
+
444
+ layer_output = apply_chunking_to_forward(
445
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
446
+ )
447
+ outputs = (layer_output,) + outputs
448
+
449
+ # if decoder, return the attn key/values as the last output
450
+ if self.is_decoder:
451
+ outputs = outputs + (present_key_value,)
452
+
453
+ return outputs
454
+
455
+ def feed_forward_chunk(self, attention_output):
456
+ intermediate_output = self.intermediate(attention_output)
457
+ layer_output = self.output(intermediate_output, attention_output)
458
+ return layer_output
459
+
460
+
461
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->Bilingual
462
+ class BilingualEncoder(nn.Module):
463
+ def __init__(self, config):
464
+ super().__init__()
465
+ self.config = config
466
+ self.layer = nn.ModuleList([BilingualLayer(config) for _ in range(config.num_hidden_layers)])
467
+ self.gradient_checkpointing = False
468
+
469
+ def forward(
470
+ self,
471
+ hidden_states: torch.Tensor,
472
+ attention_mask: Optional[torch.FloatTensor] = None,
473
+ head_mask: Optional[torch.FloatTensor] = None,
474
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
475
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
476
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
477
+ use_cache: Optional[bool] = None,
478
+ output_attentions: Optional[bool] = False,
479
+ output_hidden_states: Optional[bool] = False,
480
+ return_dict: Optional[bool] = True,
481
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
482
+ all_hidden_states = () if output_hidden_states else None
483
+ all_self_attentions = () if output_attentions else None
484
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
485
+
486
+ if self.gradient_checkpointing and self.training:
487
+ if use_cache:
488
+ logger.warning_once(
489
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
490
+ )
491
+ use_cache = False
492
+
493
+ next_decoder_cache = () if use_cache else None
494
+ for i, layer_module in enumerate(self.layer):
495
+ if output_hidden_states:
496
+ all_hidden_states = all_hidden_states + (hidden_states,)
497
+
498
+ layer_head_mask = head_mask[i] if head_mask is not None else None
499
+ past_key_value = past_key_values[i] if past_key_values is not None else None
500
+
501
+ if self.gradient_checkpointing and self.training:
502
+ layer_outputs = self._gradient_checkpointing_func(
503
+ layer_module.__call__,
504
+ hidden_states,
505
+ attention_mask,
506
+ layer_head_mask,
507
+ encoder_hidden_states,
508
+ encoder_attention_mask,
509
+ past_key_value,
510
+ output_attentions,
511
+ )
512
+ else:
513
+ layer_outputs = layer_module(
514
+ hidden_states,
515
+ attention_mask,
516
+ layer_head_mask,
517
+ encoder_hidden_states,
518
+ encoder_attention_mask,
519
+ past_key_value,
520
+ output_attentions,
521
+ )
522
+
523
+ hidden_states = layer_outputs[0]
524
+ if use_cache:
525
+ next_decoder_cache += (layer_outputs[-1],)
526
+ if output_attentions:
527
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
528
+ if self.config.add_cross_attention:
529
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
530
+
531
+ if output_hidden_states:
532
+ all_hidden_states = all_hidden_states + (hidden_states,)
533
+
534
+ if not return_dict:
535
+ return tuple(
536
+ v
537
+ for v in [
538
+ hidden_states,
539
+ next_decoder_cache,
540
+ all_hidden_states,
541
+ all_self_attentions,
542
+ all_cross_attentions,
543
+ ]
544
+ if v is not None
545
+ )
546
+ return BaseModelOutputWithPastAndCrossAttentions(
547
+ last_hidden_state=hidden_states,
548
+ past_key_values=next_decoder_cache,
549
+ hidden_states=all_hidden_states,
550
+ attentions=all_self_attentions,
551
+ cross_attentions=all_cross_attentions,
552
+ )
553
+
554
+
555
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaPooler with Roberta->Bilingual
556
+ class BilingualPooler(nn.Module):
557
+ def __init__(self, config):
558
+ super().__init__()
559
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
560
+ self.activation = nn.Tanh()
561
+
562
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
563
+ # We "pool" the model by simply taking the hidden state corresponding
564
+ # to the first token.
565
+ first_token_tensor = hidden_states[:, 0]
566
+ pooled_output = self.dense(first_token_tensor)
567
+ pooled_output = self.activation(pooled_output)
568
+ return pooled_output
569
+
570
+
571
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaPreTrainedModel with Roberta->Bilingual
572
+ class BilingualPreTrainedModel(PreTrainedModel):
573
+ """
574
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
575
+ models.
576
+ """
577
+
578
+ config_class = BilingualConfig
579
+ base_model_prefix = "roberta"
580
+ supports_gradient_checkpointing = True
581
+ _no_split_modules = ["BilingualEmbeddings", "BilingualSelfAttention"]
582
+
583
+ # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights
584
+ def _init_weights(self, module):
585
+ """Initialize the weights"""
586
+ if isinstance(module, nn.Linear):
587
+ # Slightly different from the TF version which uses truncated_normal for initialization
588
+ # cf https://github.com/pytorch/pytorch/pull/5617
589
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
590
+ if module.bias is not None:
591
+ module.bias.data.zero_()
592
+ elif isinstance(module, nn.Embedding):
593
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
594
+ if module.padding_idx is not None:
595
+ module.weight.data[module.padding_idx].zero_()
596
+ elif isinstance(module, nn.LayerNorm):
597
+ module.bias.data.zero_()
598
+ module.weight.data.fill_(1.0)
599
+
600
+
601
+ BILINGUAL_START_DOCSTRING = r"""
602
+
603
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
604
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
605
+ etc.)
606
+
607
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
608
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
609
+ and behavior.
610
+
611
+ Parameters:
612
+ config ([`BilingualConfig`]): Model configuration class with all the parameters of the
613
+ model. Initializing with a config file does not load the weights associated with the model, only the
614
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
615
+ """
616
+
617
+ BILINGUAL_INPUTS_DOCSTRING = r"""
618
+ Args:
619
+ input_ids (`torch.LongTensor` of shape `({0})`):
620
+ Indices of input sequence tokens in the vocabulary.
621
+
622
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
623
+ [`PreTrainedTokenizer.__call__`] for details.
624
+
625
+ [What are input IDs?](../glossary#input-ids)
626
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
627
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
628
+
629
+ - 1 for tokens that are **not masked**,
630
+ - 0 for tokens that are **masked**.
631
+
632
+ [What are attention masks?](../glossary#attention-mask)
633
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
634
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
635
+ 1]`:
636
+
637
+ - 0 corresponds to a *sentence A* token,
638
+ - 1 corresponds to a *sentence B* token.
639
+
640
+ [What are token type IDs?](../glossary#token-type-ids)
641
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
642
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
643
+ config.max_position_embeddings - 1]`.
644
+
645
+ [What are position IDs?](../glossary#position-ids)
646
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
647
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
648
+
649
+ - 1 indicates the head is **not masked**,
650
+ - 0 indicates the head is **masked**.
651
+
652
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
653
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
654
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
655
+ model's internal embedding lookup matrix.
656
+ output_attentions (`bool`, *optional*):
657
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
658
+ tensors for more detail.
659
+ output_hidden_states (`bool`, *optional*):
660
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
661
+ more detail.
662
+ return_dict (`bool`, *optional*):
663
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
664
+ """
665
+
666
+
667
+ @add_start_docstrings(
668
+ "The bare Bilingual Model transformer outputting raw hidden-states without any specific head on top.",
669
+ BILINGUAL_START_DOCSTRING,
670
+ )
671
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaModel with Roberta->Bilingual, ROBERTA->BILINGUAL
672
+ class BilingualModel(BilingualPreTrainedModel):
673
+ """
674
+
675
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
676
+ cross-attention is added between the self-attention layers, following the architecture described in *Attention is
677
+ all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
678
+ Kaiser and Illia Polosukhin.
679
+
680
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
681
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` argument and
682
+ `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
683
+
684
+ .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
685
+
686
+ """
687
+
688
+ # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->Bilingual
689
+ def __init__(self, config, add_pooling_layer=True):
690
+ super().__init__(config)
691
+ self.config = config
692
+
693
+ self.embeddings = BilingualEmbeddings(config)
694
+ self.encoder = BilingualEncoder(config)
695
+
696
+ self.pooler = BilingualPooler(config) if add_pooling_layer else None
697
+
698
+ # Initialize weights and apply final processing
699
+ self.post_init()
700
+
701
+ def get_input_embeddings(self):
702
+ return self.embeddings.word_embeddings
703
+
704
+ def set_input_embeddings(self, value):
705
+ self.embeddings.word_embeddings = value
706
+
707
+ def _prune_heads(self, heads_to_prune):
708
+ """
709
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
710
+ class PreTrainedModel
711
+ """
712
+ for layer, heads in heads_to_prune.items():
713
+ self.encoder.layer[layer].attention.prune_heads(heads)
714
+
715
+ @add_start_docstrings_to_model_forward(BILINGUAL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
716
+ @add_code_sample_docstrings(
717
+ checkpoint=_CHECKPOINT_FOR_DOC,
718
+ output_type=BaseModelOutputWithPoolingAndCrossAttentions,
719
+ config_class=_CONFIG_FOR_DOC,
720
+ )
721
+ # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward
722
+ def forward(
723
+ self,
724
+ input_ids: Optional[torch.Tensor] = None,
725
+ attention_mask: Optional[torch.Tensor] = None,
726
+ token_type_ids: Optional[torch.Tensor] = None,
727
+ position_ids: Optional[torch.Tensor] = None,
728
+ head_mask: Optional[torch.Tensor] = None,
729
+ inputs_embeds: Optional[torch.Tensor] = None,
730
+ encoder_hidden_states: Optional[torch.Tensor] = None,
731
+ encoder_attention_mask: Optional[torch.Tensor] = None,
732
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
733
+ use_cache: Optional[bool] = None,
734
+ output_attentions: Optional[bool] = None,
735
+ output_hidden_states: Optional[bool] = None,
736
+ return_dict: Optional[bool] = None,
737
+ ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
738
+ r"""
739
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
740
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
741
+ the model is configured as a decoder.
742
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
743
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
744
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
745
+
746
+ - 1 for tokens that are **not masked**,
747
+ - 0 for tokens that are **masked**.
748
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
749
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
750
+
751
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
752
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
753
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
754
+ use_cache (`bool`, *optional*):
755
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
756
+ `past_key_values`).
757
+ """
758
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
759
+ output_hidden_states = (
760
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
761
+ )
762
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
763
+
764
+ if self.config.is_decoder:
765
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
766
+ else:
767
+ use_cache = False
768
+
769
+ if input_ids is not None and inputs_embeds is not None:
770
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
771
+ elif input_ids is not None:
772
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
773
+ input_shape = input_ids.size()
774
+ elif inputs_embeds is not None:
775
+ input_shape = inputs_embeds.size()[:-1]
776
+ else:
777
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
778
+
779
+ batch_size, seq_length = input_shape
780
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
781
+
782
+ # past_key_values_length
783
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
784
+
785
+ if attention_mask is None:
786
+ attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
787
+
788
+ if token_type_ids is None:
789
+ if hasattr(self.embeddings, "token_type_ids"):
790
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
791
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
792
+ token_type_ids = buffered_token_type_ids_expanded
793
+ else:
794
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
795
+
796
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
797
+ # ourselves in which case we just need to make it broadcastable to all heads.
798
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
799
+
800
+ # If a 2D or 3D attention mask is provided for the cross-attention
801
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
802
+ if self.config.is_decoder and encoder_hidden_states is not None:
803
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
804
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
805
+ if encoder_attention_mask is None:
806
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
807
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
808
+ else:
809
+ encoder_extended_attention_mask = None
810
+
811
+ # Prepare head mask if needed
812
+ # 1.0 in head_mask indicate we keep the head
813
+ # attention_probs has shape bsz x n_heads x N x N
814
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
815
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
816
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
817
+
818
+ embedding_output = self.embeddings(
819
+ input_ids=input_ids,
820
+ position_ids=position_ids,
821
+ token_type_ids=token_type_ids,
822
+ inputs_embeds=inputs_embeds,
823
+ past_key_values_length=past_key_values_length,
824
+ )
825
+ encoder_outputs = self.encoder(
826
+ embedding_output,
827
+ attention_mask=extended_attention_mask,
828
+ head_mask=head_mask,
829
+ encoder_hidden_states=encoder_hidden_states,
830
+ encoder_attention_mask=encoder_extended_attention_mask,
831
+ past_key_values=past_key_values,
832
+ use_cache=use_cache,
833
+ output_attentions=output_attentions,
834
+ output_hidden_states=output_hidden_states,
835
+ return_dict=return_dict,
836
+ )
837
+ sequence_output = encoder_outputs[0]
838
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
839
+
840
+ if not return_dict:
841
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
842
+
843
+ return BaseModelOutputWithPoolingAndCrossAttentions(
844
+ last_hidden_state=sequence_output,
845
+ pooler_output=pooled_output,
846
+ past_key_values=encoder_outputs.past_key_values,
847
+ hidden_states=encoder_outputs.hidden_states,
848
+ attentions=encoder_outputs.attentions,
849
+ cross_attentions=encoder_outputs.cross_attentions,
850
+ )
851
+
852
+
853
+ @add_start_docstrings(
854
+ "Bilingual Model with a `language modeling` head on top for CLM fine-tuning.",
855
+ BILINGUAL_START_DOCSTRING,
856
+ )
857
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Bilingual, ROBERTA->BILINGUAL
858
+ class BilingualForCausalLM(BilingualPreTrainedModel):
859
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
860
+
861
+ def __init__(self, config):
862
+ super().__init__(config)
863
+
864
+ if not config.is_decoder:
865
+ logger.warning("If you want to use `BilingualLMHeadModel` as a standalone, add `is_decoder=True.`")
866
+
867
+ self.roberta = BilingualModel(config, add_pooling_layer=False)
868
+ self.lm_head = BilingualLMHead(config)
869
+
870
+ # Initialize weights and apply final processing
871
+ self.post_init()
872
+
873
+ def get_output_embeddings(self):
874
+ return self.lm_head.decoder
875
+
876
+ def set_output_embeddings(self, new_embeddings):
877
+ self.lm_head.decoder = new_embeddings
878
+
879
+ @add_start_docstrings_to_model_forward(BILINGUAL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
880
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
881
+ def forward(
882
+ self,
883
+ input_ids: Optional[torch.LongTensor] = None,
884
+ attention_mask: Optional[torch.FloatTensor] = None,
885
+ token_type_ids: Optional[torch.LongTensor] = None,
886
+ position_ids: Optional[torch.LongTensor] = None,
887
+ head_mask: Optional[torch.FloatTensor] = None,
888
+ inputs_embeds: Optional[torch.FloatTensor] = None,
889
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
890
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
891
+ labels: Optional[torch.LongTensor] = None,
892
+ past_key_values: Tuple[Tuple[torch.FloatTensor]] = None,
893
+ use_cache: Optional[bool] = None,
894
+ output_attentions: Optional[bool] = None,
895
+ output_hidden_states: Optional[bool] = None,
896
+ return_dict: Optional[bool] = None,
897
+ ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
898
+ r"""
899
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
900
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
901
+ the model is configured as a decoder.
902
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
903
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
904
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
905
+
906
+ - 1 for tokens that are **not masked**,
907
+ - 0 for tokens that are **masked**.
908
+
909
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
910
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
911
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
912
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
913
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
914
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
915
+
916
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
917
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
918
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
919
+ use_cache (`bool`, *optional*):
920
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
921
+ `past_key_values`).
922
+
923
+ Returns:
924
+
925
+ Example:
926
+
927
+ ```python
928
+ >>> from transformers import AutoTokenizer, BilingualForCausalLM, AutoConfig
929
+ >>> import torch
930
+
931
+ >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
932
+ >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
933
+ >>> config.is_decoder = True
934
+ >>> model = BilingualForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)
935
+
936
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
937
+ >>> outputs = model(**inputs)
938
+
939
+ >>> prediction_logits = outputs.logits
940
+ ```"""
941
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
942
+ if labels is not None:
943
+ use_cache = False
944
+
945
+ outputs = self.roberta(
946
+ input_ids,
947
+ attention_mask=attention_mask,
948
+ token_type_ids=token_type_ids,
949
+ position_ids=position_ids,
950
+ head_mask=head_mask,
951
+ inputs_embeds=inputs_embeds,
952
+ encoder_hidden_states=encoder_hidden_states,
953
+ encoder_attention_mask=encoder_attention_mask,
954
+ past_key_values=past_key_values,
955
+ use_cache=use_cache,
956
+ output_attentions=output_attentions,
957
+ output_hidden_states=output_hidden_states,
958
+ return_dict=return_dict,
959
+ )
960
+
961
+ sequence_output = outputs[0]
962
+ prediction_scores = self.lm_head(sequence_output)
963
+
964
+ lm_loss = None
965
+ if labels is not None:
966
+ # move labels to correct device to enable model parallelism
967
+ labels = labels.to(prediction_scores.device)
968
+ # we are doing next-token prediction; shift prediction scores and input ids by one
969
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
970
+ labels = labels[:, 1:].contiguous()
971
+ loss_fct = CrossEntropyLoss()
972
+ lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
973
+
974
+ if not return_dict:
975
+ output = (prediction_scores,) + outputs[2:]
976
+ return ((lm_loss,) + output) if lm_loss is not None else output
977
+
978
+ return CausalLMOutputWithCrossAttentions(
979
+ loss=lm_loss,
980
+ logits=prediction_scores,
981
+ past_key_values=outputs.past_key_values,
982
+ hidden_states=outputs.hidden_states,
983
+ attentions=outputs.attentions,
984
+ cross_attentions=outputs.cross_attentions,
985
+ )
986
+
987
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
988
+ input_shape = input_ids.shape
989
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
990
+ if attention_mask is None:
991
+ attention_mask = input_ids.new_ones(input_shape)
992
+
993
+ # cut decoder_input_ids if past_key_values is used
994
+ if past_key_values is not None:
995
+ past_length = past_key_values[0][0].shape[2]
996
+
997
+ # Some generation methods already pass only the last input ID
998
+ if input_ids.shape[1] > past_length:
999
+ remove_prefix_length = past_length
1000
+ else:
1001
+ # Default to old behavior: keep only final ID
1002
+ remove_prefix_length = input_ids.shape[1] - 1
1003
+
1004
+ input_ids = input_ids[:, remove_prefix_length:]
1005
+
1006
+ return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
1007
+
1008
+ def _reorder_cache(self, past_key_values, beam_idx):
1009
+ reordered_past = ()
1010
+ for layer_past in past_key_values:
1011
+ reordered_past += (
1012
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1013
+ )
1014
+ return reordered_past
1015
+
1016
+
1017
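+ # Usage sketch for `BilingualForCausalLM` with cached generation (a minimal, illustrative example;
+ # as in the docstring example above, the RoBERTa checkpoint only supplies compatible weights):
+ #
+ # >>> from transformers import AutoTokenizer, AutoConfig
+ # >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
+ # >>> config = AutoConfig.from_pretrained("FacebookAI/roberta-base")
+ # >>> config.is_decoder = True
+ # >>> model = BilingualForCausalLM.from_pretrained("FacebookAI/roberta-base", config=config)
+ # >>> inputs = tokenizer("Hello, my dog is", return_tensors="pt")
+ # >>> # `use_cache=True` exercises `prepare_inputs_for_generation`, which trims the prompt to the
+ # >>> # tokens not yet covered by `past_key_values` at each decoding step
+ # >>> generated = model.generate(**inputs, max_new_tokens=5, use_cache=True)
+ # >>> tokenizer.decode(generated[0], skip_special_tokens=True)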
+ @add_start_docstrings(
1018
+ """Bilingual Model with a `language modeling` head on top.""",
1019
+ BILINGUAL_START_DOCSTRING,
1020
+ )
1021
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Bilingual, ROBERTA->BILINGUAL
1022
+ class BilingualForMaskedLM(BilingualPreTrainedModel):
1023
+ _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
1024
+
1025
+ def __init__(self, config):
1026
+ super().__init__(config)
1027
+
1028
+ if config.is_decoder:
1029
+ logger.warning(
1030
+ "If you want to use `BilingualForMaskedLM` make sure `config.is_decoder=False` for "
1031
+ "bi-directional self-attention."
1032
+ )
1033
+
1034
+ self.roberta = BilingualModel(config, add_pooling_layer=False)
1035
+ self.lm_head = BilingualLMHead(config)
1036
+
1037
+ # Initialize weights and apply final processing
1038
+ self.post_init()
1039
+
1040
+ def get_output_embeddings(self):
1041
+ return self.lm_head.decoder
1042
+
1043
+ def set_output_embeddings(self, new_embeddings):
1044
+ self.lm_head.decoder = new_embeddings
1045
+
1046
+ @add_start_docstrings_to_model_forward(BILINGUAL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1047
+ @add_code_sample_docstrings(
1048
+ checkpoint=_CHECKPOINT_FOR_DOC,
1049
+ output_type=MaskedLMOutput,
1050
+ config_class=_CONFIG_FOR_DOC,
1051
+ mask="<mask>",
1052
+ expected_output="' Paris'",
1053
+ expected_loss=0.1,
1054
+ )
1055
+ def forward(
1056
+ self,
1057
+ input_ids: Optional[torch.LongTensor] = None,
1058
+ attention_mask: Optional[torch.FloatTensor] = None,
1059
+ token_type_ids: Optional[torch.LongTensor] = None,
1060
+ position_ids: Optional[torch.LongTensor] = None,
1061
+ head_mask: Optional[torch.FloatTensor] = None,
1062
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1063
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
1064
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
1065
+ labels: Optional[torch.LongTensor] = None,
1066
+ output_attentions: Optional[bool] = None,
1067
+ output_hidden_states: Optional[bool] = None,
1068
+ return_dict: Optional[bool] = None,
1069
+ ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]:
1070
+ r"""
1071
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1072
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
1073
+ config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
1074
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
1075
+ kwargs (`Dict[str, any]`, optional, defaults to *{}*):
1076
+ Used to hide legacy arguments that have been deprecated.
1077
+ """
1078
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1079
+
1080
+ outputs = self.roberta(
1081
+ input_ids,
1082
+ attention_mask=attention_mask,
1083
+ token_type_ids=token_type_ids,
1084
+ position_ids=position_ids,
1085
+ head_mask=head_mask,
1086
+ inputs_embeds=inputs_embeds,
1087
+ encoder_hidden_states=encoder_hidden_states,
1088
+ encoder_attention_mask=encoder_attention_mask,
1089
+ output_attentions=output_attentions,
1090
+ output_hidden_states=output_hidden_states,
1091
+ return_dict=return_dict,
1092
+ )
1093
+ sequence_output = outputs[0]
1094
+ prediction_scores = self.lm_head(sequence_output)
1095
+
1096
+ masked_lm_loss = None
1097
+ if labels is not None:
1098
+ # move labels to correct device to enable model parallelism
1099
+ labels = labels.to(prediction_scores.device)
1100
+ loss_fct = CrossEntropyLoss()
1101
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
1102
+
1103
+ if not return_dict:
1104
+ output = (prediction_scores,) + outputs[2:]
1105
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
1106
+
1107
+ return MaskedLMOutput(
1108
+ loss=masked_lm_loss,
1109
+ logits=prediction_scores,
1110
+ hidden_states=outputs.hidden_states,
1111
+ attentions=outputs.attentions,
1112
+ )
1113
+
1114
+
1115
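+ # Usage sketch for `BilingualForMaskedLM` (a minimal fill-mask example; the checkpoint name is
+ # illustrative and any checkpoint trained with this architecture can be substituted):
+ #
+ # >>> import torch
+ # >>> from transformers import AutoTokenizer
+ # >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
+ # >>> model = BilingualForMaskedLM.from_pretrained("FacebookAI/roberta-base")
+ # >>> inputs = tokenizer("The capital of France is <mask>.", return_tensors="pt")
+ # >>> with torch.no_grad():
+ # ...     logits = model(**inputs).logits
+ # >>> mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
+ # >>> tokenizer.decode(logits[0, mask_index].argmax(dim=-1))  # expected: " Paris"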
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead
1116
+ class BilingualLMHead(nn.Module):
1117
+ """Roberta Head for masked language modeling."""
1118
+
1119
+ def __init__(self, config):
1120
+ super().__init__()
1121
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
1122
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
1123
+
1124
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size)
1125
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
1126
+ self.decoder.bias = self.bias
1127
+
1128
+ def forward(self, features, **kwargs):
1129
+ x = self.dense(features)
1130
+ x = gelu(x)
1131
+ x = self.layer_norm(x)
1132
+
1133
+ # project back to size of vocabulary with bias
1134
+ x = self.decoder(x)
1135
+
1136
+ return x
1137
+
1138
+ def _tie_weights(self):
1139
+ # To tie those two weights if they get disconnected (on TPU or when the bias is resized)
1140
+ # For accelerate compatibility and to not break backward compatibility
1141
+ if self.decoder.bias.device.type == "meta":
1142
+ self.decoder.bias = self.bias
1143
+ else:
1144
+ self.bias = self.decoder.bias
1145
+
1146
+
1147
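+ # Shape sketch for `BilingualLMHead`: the dense + GELU + LayerNorm stack keeps the hidden size, and
+ # `decoder` projects back to the vocabulary (a standalone illustration; `config` is assumed to be a
+ # `BilingualConfig` instance):
+ #
+ # >>> import torch
+ # >>> head = BilingualLMHead(config)
+ # >>> hidden_states = torch.randn(1, 4, config.hidden_size)
+ # >>> head(hidden_states).shape  # (1, 4, config.vocab_size)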
+ @add_start_docstrings(
1148
+ """
1149
+ Bilingual Model transformer with a sequence classification/regression head on top (a linear layer on top of the
1150
+ pooled output) e.g. for GLUE tasks.
1151
+ """,
1152
+ BILINGUAL_START_DOCSTRING,
1153
+ )
1154
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->Bilingual, ROBERTA->BILINGUAL
1155
+ class BilingualForSequenceClassification(BilingualPreTrainedModel):
1156
+ def __init__(self, config):
1157
+ super().__init__(config)
1158
+ self.num_labels = config.num_labels
1159
+ self.config = config
1160
+
1161
+ self.roberta = BilingualModel(config, add_pooling_layer=False)
1162
+ self.classifier = BilingualClassificationHead(config)
1163
+
1164
+ # Initialize weights and apply final processing
1165
+ self.post_init()
1166
+
1167
+ @add_start_docstrings_to_model_forward(BILINGUAL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1168
+ @add_code_sample_docstrings(
1169
+ checkpoint="cardiffnlp/twitter-roberta-base-emotion",
1170
+ output_type=SequenceClassifierOutput,
1171
+ config_class=_CONFIG_FOR_DOC,
1172
+ expected_output="'optimism'",
1173
+ expected_loss=0.08,
1174
+ )
1175
+ def forward(
1176
+ self,
1177
+ input_ids: Optional[torch.LongTensor] = None,
1178
+ attention_mask: Optional[torch.FloatTensor] = None,
1179
+ token_type_ids: Optional[torch.LongTensor] = None,
1180
+ position_ids: Optional[torch.LongTensor] = None,
1181
+ head_mask: Optional[torch.FloatTensor] = None,
1182
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1183
+ labels: Optional[torch.LongTensor] = None,
1184
+ output_attentions: Optional[bool] = None,
1185
+ output_hidden_states: Optional[bool] = None,
1186
+ return_dict: Optional[bool] = None,
1187
+ ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]:
1188
+ r"""
1189
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1190
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1191
+ config.num_labels - 1]`. If `config.num_labels == 1`, a regression loss is computed (Mean-Square loss). If
1192
+ `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
1193
+ """
1194
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1195
+
1196
+ outputs = self.roberta(
1197
+ input_ids,
1198
+ attention_mask=attention_mask,
1199
+ token_type_ids=token_type_ids,
1200
+ position_ids=position_ids,
1201
+ head_mask=head_mask,
1202
+ inputs_embeds=inputs_embeds,
1203
+ output_attentions=output_attentions,
1204
+ output_hidden_states=output_hidden_states,
1205
+ return_dict=return_dict,
1206
+ )
1207
+ sequence_output = outputs[0]
1208
+ logits = self.classifier(sequence_output)
1209
+
1210
+ loss = None
1211
+ if labels is not None:
1212
+ # move labels to correct device to enable model parallelism
1213
+ labels = labels.to(logits.device)
1214
+ if self.config.problem_type is None:
1215
+ if self.num_labels == 1:
1216
+ self.config.problem_type = "regression"
1217
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
1218
+ self.config.problem_type = "single_label_classification"
1219
+ else:
1220
+ self.config.problem_type = "multi_label_classification"
1221
+
1222
+ if self.config.problem_type == "regression":
1223
+ loss_fct = MSELoss()
1224
+ if self.num_labels == 1:
1225
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
1226
+ else:
1227
+ loss = loss_fct(logits, labels)
1228
+ elif self.config.problem_type == "single_label_classification":
1229
+ loss_fct = CrossEntropyLoss()
1230
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1231
+ elif self.config.problem_type == "multi_label_classification":
1232
+ loss_fct = BCEWithLogitsLoss()
1233
+ loss = loss_fct(logits, labels)
1234
+
1235
+ if not return_dict:
1236
+ output = (logits,) + outputs[2:]
1237
+ return ((loss,) + output) if loss is not None else output
1238
+
1239
+ return SequenceClassifierOutput(
1240
+ loss=loss,
1241
+ logits=logits,
1242
+ hidden_states=outputs.hidden_states,
1243
+ attentions=outputs.attentions,
1244
+ )
1245
+
1246
+
1247
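+ # Usage sketch for `BilingualForSequenceClassification` (illustrative; an untrained classification
+ # head is instantiated from a base checkpoint, so the logits are only meaningful after fine-tuning):
+ #
+ # >>> import torch
+ # >>> from transformers import AutoTokenizer
+ # >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
+ # >>> model = BilingualForSequenceClassification.from_pretrained("FacebookAI/roberta-base", num_labels=2)
+ # >>> inputs = tokenizer("A short example sentence.", return_tensors="pt")
+ # >>> labels = torch.tensor([1])
+ # >>> # integer labels with num_labels > 1 make the forward pass pick "single_label_classification",
+ # >>> # i.e. a cross-entropy loss over the two logits
+ # >>> outputs = model(**inputs, labels=labels)
+ # >>> outputs.loss, outputs.logits.shape  # scalar loss, (1, 2)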
+ @add_start_docstrings(
1248
+ """
1249
+ Bilingual Model with a multiple choice classification head on top (a linear layer on top of the pooled output and
1250
+ a softmax) e.g. for RocStories/SWAG tasks.
1251
+ """,
1252
+ BILINGUAL_START_DOCSTRING,
1253
+ )
1254
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->Bilingual, ROBERTA->BILINGUAL
1255
+ class BilingualForMultipleChoice(BilingualPreTrainedModel):
1256
+ def __init__(self, config):
1257
+ super().__init__(config)
1258
+
1259
+ self.roberta = BilingualModel(config)
1260
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
1261
+ self.classifier = nn.Linear(config.hidden_size, 1)
1262
+
1263
+ # Initialize weights and apply final processing
1264
+ self.post_init()
1265
+
1266
+ @add_start_docstrings_to_model_forward(
1267
+ BILINGUAL_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
1268
+ )
1269
+ @add_code_sample_docstrings(
1270
+ checkpoint=_CHECKPOINT_FOR_DOC,
1271
+ output_type=MultipleChoiceModelOutput,
1272
+ config_class=_CONFIG_FOR_DOC,
1273
+ )
1274
+ def forward(
1275
+ self,
1276
+ input_ids: Optional[torch.LongTensor] = None,
1277
+ token_type_ids: Optional[torch.LongTensor] = None,
1278
+ attention_mask: Optional[torch.FloatTensor] = None,
1279
+ labels: Optional[torch.LongTensor] = None,
1280
+ position_ids: Optional[torch.LongTensor] = None,
1281
+ head_mask: Optional[torch.FloatTensor] = None,
1282
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1283
+ output_attentions: Optional[bool] = None,
1284
+ output_hidden_states: Optional[bool] = None,
1285
+ return_dict: Optional[bool] = None,
1286
+ ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]:
1287
+ r"""
1288
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1289
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
1290
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
1291
+ `input_ids` above)
1292
+ """
1293
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1294
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
1295
+
1296
+ flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
1297
+ flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
1298
+ flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
1299
+ flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
1300
+ flat_inputs_embeds = (
1301
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
1302
+ if inputs_embeds is not None
1303
+ else None
1304
+ )
1305
+
1306
+ outputs = self.roberta(
1307
+ flat_input_ids,
1308
+ position_ids=flat_position_ids,
1309
+ token_type_ids=flat_token_type_ids,
1310
+ attention_mask=flat_attention_mask,
1311
+ head_mask=head_mask,
1312
+ inputs_embeds=flat_inputs_embeds,
1313
+ output_attentions=output_attentions,
1314
+ output_hidden_states=output_hidden_states,
1315
+ return_dict=return_dict,
1316
+ )
1317
+ pooled_output = outputs[1]
1318
+
1319
+ pooled_output = self.dropout(pooled_output)
1320
+ logits = self.classifier(pooled_output)
1321
+ reshaped_logits = logits.view(-1, num_choices)
1322
+
1323
+ loss = None
1324
+ if labels is not None:
1325
+ # move labels to correct device to enable model parallelism
1326
+ labels = labels.to(reshaped_logits.device)
1327
+ loss_fct = CrossEntropyLoss()
1328
+ loss = loss_fct(reshaped_logits, labels)
1329
+
1330
+ if not return_dict:
1331
+ output = (reshaped_logits,) + outputs[2:]
1332
+ return ((loss,) + output) if loss is not None else output
1333
+
1334
+ return MultipleChoiceModelOutput(
1335
+ loss=loss,
1336
+ logits=reshaped_logits,
1337
+ hidden_states=outputs.hidden_states,
1338
+ attentions=outputs.attentions,
1339
+ )
1340
+
1341
+
1342
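+ # Input-shaping sketch for `BilingualForMultipleChoice`: every tensor is (batch_size, num_choices,
+ # sequence_length) and is flattened to (batch_size * num_choices, sequence_length) before the
+ # encoder, so the classifier can score each choice (illustrative checkpoint and toy choices):
+ #
+ # >>> import torch
+ # >>> from transformers import AutoTokenizer
+ # >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
+ # >>> model = BilingualForMultipleChoice.from_pretrained("FacebookAI/roberta-base")
+ # >>> prompt = "The cat sat on the"
+ # >>> choices = ["mat.", "sky."]
+ # >>> enc = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
+ # >>> inputs = {k: v.unsqueeze(0) for k, v in enc.items()}  # add the num_choices dimension
+ # >>> outputs = model(**inputs, labels=torch.tensor([0]))
+ # >>> outputs.logits.shape  # (1, 2): one score per choice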
+ @add_start_docstrings(
1343
+ """
1344
+ Bilingual Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
1345
+ for Named-Entity-Recognition (NER) tasks.
1346
+ """,
1347
+ BILINGUAL_START_DOCSTRING,
1348
+ )
1349
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->Bilingual, ROBERTA->BILINGUAL
1350
+ class BilingualForTokenClassification(BilingualPreTrainedModel):
1351
+ def __init__(self, config):
1352
+ super().__init__(config)
1353
+ self.num_labels = config.num_labels
1354
+
1355
+ self.roberta = BilingualModel(config, add_pooling_layer=False)
1356
+ classifier_dropout = (
1357
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
1358
+ )
1359
+ self.dropout = nn.Dropout(classifier_dropout)
1360
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1361
+
1362
+ # Initialize weights and apply final processing
1363
+ self.post_init()
1364
+
1365
+ @add_start_docstrings_to_model_forward(BILINGUAL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1366
+ @add_code_sample_docstrings(
1367
+ checkpoint="Jean-Baptiste/roberta-large-ner-english",
1368
+ output_type=TokenClassifierOutput,
1369
+ config_class=_CONFIG_FOR_DOC,
1370
+ expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']",
1371
+ expected_loss=0.01,
1372
+ )
1373
+ def forward(
1374
+ self,
1375
+ input_ids: Optional[torch.LongTensor] = None,
1376
+ attention_mask: Optional[torch.FloatTensor] = None,
1377
+ token_type_ids: Optional[torch.LongTensor] = None,
1378
+ position_ids: Optional[torch.LongTensor] = None,
1379
+ head_mask: Optional[torch.FloatTensor] = None,
1380
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1381
+ labels: Optional[torch.LongTensor] = None,
1382
+ output_attentions: Optional[bool] = None,
1383
+ output_hidden_states: Optional[bool] = None,
1384
+ return_dict: Optional[bool] = None,
1385
+ ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
1386
+ r"""
1387
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1388
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
1389
+ """
1390
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1391
+
1392
+ outputs = self.roberta(
1393
+ input_ids,
1394
+ attention_mask=attention_mask,
1395
+ token_type_ids=token_type_ids,
1396
+ position_ids=position_ids,
1397
+ head_mask=head_mask,
1398
+ inputs_embeds=inputs_embeds,
1399
+ output_attentions=output_attentions,
1400
+ output_hidden_states=output_hidden_states,
1401
+ return_dict=return_dict,
1402
+ )
1403
+
1404
+ sequence_output = outputs[0]
1405
+
1406
+ sequence_output = self.dropout(sequence_output)
1407
+ logits = self.classifier(sequence_output)
1408
+
1409
+ loss = None
1410
+ if labels is not None:
1411
+ # move labels to correct device to enable model parallelism
1412
+ labels = labels.to(logits.device)
1413
+ loss_fct = CrossEntropyLoss()
1414
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1415
+
1416
+ if not return_dict:
1417
+ output = (logits,) + outputs[2:]
1418
+ return ((loss,) + output) if loss is not None else output
1419
+
1420
+ return TokenClassifierOutput(
1421
+ loss=loss,
1422
+ logits=logits,
1423
+ hidden_states=outputs.hidden_states,
1424
+ attentions=outputs.attentions,
1425
+ )
1426
+
1427
+
1428
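+ # Usage sketch for `BilingualForTokenClassification` (illustrative; `num_labels` and the checkpoint
+ # are placeholders for a fine-tuned NER model such as the one referenced in the docstring sample):
+ #
+ # >>> import torch
+ # >>> from transformers import AutoTokenizer
+ # >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
+ # >>> model = BilingualForTokenClassification.from_pretrained("FacebookAI/roberta-base", num_labels=5)
+ # >>> inputs = tokenizer("HuggingFace is based in Paris.", return_tensors="pt")
+ # >>> with torch.no_grad():
+ # ...     logits = model(**inputs).logits
+ # >>> logits.shape  # (1, sequence_length, 5): one label distribution per token
+ # >>> predictions = logits.argmax(dim=-1)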
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Bilingual
1429
+ class BilingualClassificationHead(nn.Module):
1430
+ """Roberta Head for sentence-level classification tasks."""
1431
+
1432
+ def __init__(self, config):
1433
+ super().__init__()
1434
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
1435
+ classifier_dropout = (
1436
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
1437
+ )
1438
+ self.dropout = nn.Dropout(classifier_dropout)
1439
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
1440
+
1441
+ def forward(self, features, **kwargs):
1442
+ x = features[:, 0, :] # take <s> token (equiv. to [CLS])
1443
+ x = self.dropout(x)
1444
+ x = self.dense(x)
1445
+ x = torch.tanh(x)
1446
+ x = self.dropout(x)
1447
+ x = self.out_proj(x)
1448
+ return x
1449
+
1450
+
1451
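+ # Shape sketch for `BilingualClassificationHead`: only the first (<s>) token of the sequence output
+ # is used as the pooled representation (standalone illustration; `config` is assumed to be a
+ # `BilingualConfig` with `num_labels` set):
+ #
+ # >>> import torch
+ # >>> head = BilingualClassificationHead(config)
+ # >>> sequence_output = torch.randn(2, 16, config.hidden_size)
+ # >>> head(sequence_output).shape  # (2, config.num_labels)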
+ @add_start_docstrings(
1452
+ """
1453
+ Bilingual Model with a span classification head on top for extractive question-answering tasks like SQuAD (a
1454
+ linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1455
+ """,
1456
+ BILINGUAL_START_DOCSTRING,
1457
+ )
1458
+ # Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->Bilingual, ROBERTA->BILINGUAL
1459
+ class BilingualForQuestionAnswering(BilingualPreTrainedModel):
1460
+ def __init__(self, config):
1461
+ super().__init__(config)
1462
+ self.num_labels = config.num_labels
1463
+
1464
+ self.roberta = BilingualModel(config, add_pooling_layer=False)
1465
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
1466
+
1467
+ # Initialize weights and apply final processing
1468
+ self.post_init()
1469
+
1470
+ @add_start_docstrings_to_model_forward(BILINGUAL_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
1471
+ @add_code_sample_docstrings(
1472
+ checkpoint="deepset/roberta-base-squad2",
1473
+ output_type=QuestionAnsweringModelOutput,
1474
+ config_class=_CONFIG_FOR_DOC,
1475
+ expected_output="' puppet'",
1476
+ expected_loss=0.86,
1477
+ )
1478
+ def forward(
1479
+ self,
1480
+ input_ids: Optional[torch.LongTensor] = None,
1481
+ attention_mask: Optional[torch.FloatTensor] = None,
1482
+ token_type_ids: Optional[torch.LongTensor] = None,
1483
+ position_ids: Optional[torch.LongTensor] = None,
1484
+ head_mask: Optional[torch.FloatTensor] = None,
1485
+ inputs_embeds: Optional[torch.FloatTensor] = None,
1486
+ start_positions: Optional[torch.LongTensor] = None,
1487
+ end_positions: Optional[torch.LongTensor] = None,
1488
+ output_attentions: Optional[bool] = None,
1489
+ output_hidden_states: Optional[bool] = None,
1490
+ return_dict: Optional[bool] = None,
1491
+ ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]:
1492
+ r"""
1493
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1494
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
1495
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1496
+ are not taken into account for computing the loss.
1497
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1498
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
1499
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1500
+ are not taken into account for computing the loss.
1501
+ """
1502
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1503
+
1504
+ outputs = self.roberta(
1505
+ input_ids,
1506
+ attention_mask=attention_mask,
1507
+ token_type_ids=token_type_ids,
1508
+ position_ids=position_ids,
1509
+ head_mask=head_mask,
1510
+ inputs_embeds=inputs_embeds,
1511
+ output_attentions=output_attentions,
1512
+ output_hidden_states=output_hidden_states,
1513
+ return_dict=return_dict,
1514
+ )
1515
+
1516
+ sequence_output = outputs[0]
1517
+
1518
+ logits = self.qa_outputs(sequence_output)
1519
+ start_logits, end_logits = logits.split(1, dim=-1)
1520
+ start_logits = start_logits.squeeze(-1).contiguous()
1521
+ end_logits = end_logits.squeeze(-1).contiguous()
1522
+
1523
+ total_loss = None
1524
+ if start_positions is not None and end_positions is not None:
1525
+ # If we are on multi-GPU, the target tensors may carry an extra dimension; squeeze it away
1526
+ if len(start_positions.size()) > 1:
1527
+ start_positions = start_positions.squeeze(-1)
1528
+ if len(end_positions.size()) > 1:
1529
+ end_positions = end_positions.squeeze(-1)
1530
+ # sometimes the start/end positions are outside our model inputs; we ignore these terms
1531
+ ignored_index = start_logits.size(1)
1532
+ start_positions = start_positions.clamp(0, ignored_index)
1533
+ end_positions = end_positions.clamp(0, ignored_index)
1534
+
1535
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1536
+ start_loss = loss_fct(start_logits, start_positions)
1537
+ end_loss = loss_fct(end_logits, end_positions)
1538
+ total_loss = (start_loss + end_loss) / 2
1539
+
1540
+ if not return_dict:
1541
+ output = (start_logits, end_logits) + outputs[2:]
1542
+ return ((total_loss,) + output) if total_loss is not None else output
1543
+
1544
+ return QuestionAnsweringModelOutput(
1545
+ loss=total_loss,
1546
+ start_logits=start_logits,
1547
+ end_logits=end_logits,
1548
+ hidden_states=outputs.hidden_states,
1549
+ attentions=outputs.attentions,
1550
+ )
1551
+
1552
+
1553
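+ # Usage sketch for `BilingualForQuestionAnswering`: the answer span is recovered by taking the
+ # argmax of the start and end logits (the SQuAD checkpoint below mirrors the docstring sample and is
+ # only illustrative):
+ #
+ # >>> import torch
+ # >>> from transformers import AutoTokenizer
+ # >>> tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
+ # >>> model = BilingualForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
+ # >>> question, context = "Who wrote the play?", "The play was written by William Shakespeare."
+ # >>> inputs = tokenizer(question, context, return_tensors="pt")
+ # >>> with torch.no_grad():
+ # ...     outputs = model(**inputs)
+ # >>> start = outputs.start_logits.argmax()
+ # >>> end = outputs.end_logits.argmax()
+ # >>> tokenizer.decode(inputs.input_ids[0, start : end + 1])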
+ # Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
1554
+ def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
1555
+ """
1556
+ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
1557
+ are ignored. This is modified from fairseq's `utils.make_positions`.
1558
+
1559
+ Args:
1560
+ input_ids (`torch.Tensor`): Input ids; padding positions are identified by `padding_idx`.
+ padding_idx (`int`): Index of the padding token in the vocabulary.
+ past_key_values_length (`int`, *optional*, defaults to 0): Length of the cached past, added to the computed positions.
1561
+
1562
+ Returns: torch.Tensor
1563
+ """
1564
+ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
1565
+ mask = input_ids.ne(padding_idx).int()
1566
+ incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
1567
+ return incremental_indices.long() + padding_idx
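+ # Worked example for `create_position_ids_from_input_ids` with `padding_idx=1` (the RoBERTa
+ # convention; the token ids are arbitrary and only the padding structure matters): padding positions
+ # keep `padding_idx`, real tokens are numbered from `padding_idx + 1` onwards.
+ #
+ # >>> import torch
+ # >>> input_ids = torch.tensor([[0, 31414, 232, 2, 1, 1]])  # the trailing 1s are padding
+ # >>> create_position_ids_from_input_ids(input_ids, padding_idx=1)
+ # tensor([[2, 3, 4, 5, 1, 1]])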