ccdv committed
Commit 5ef7af7
1 Parent(s): 4488494

update for transformers >= 4.29.1

Files changed (1)
  1. modeling_lsg_xlm_roberta.py +16 -20
modeling_lsg_xlm_roberta.py CHANGED
@@ -188,19 +188,25 @@ class CausalAttentionProduct(nn.Module):
         del key_layer

         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
-            attention_scores = attention_scores + attention_mask
-
             # Add causal mask
             causal_shape = (self.block_size, self.block_size) if causal_shape is None else causal_shape
             causal_mask = torch.tril(
                 torch.ones(*causal_shape, device=attention_mask.device, dtype=attention_scores.dtype),
                 diagonal=-1
                 )
-            causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
-            attention_scores[..., -causal_shape[0]:, -causal_shape[1] + 1:] = causal_mask[:, 1:]
+
+            # Min value
+            dtype_min = torch.tensor(
+                torch.finfo(attention_scores.dtype).min, device=attention_scores.device, dtype=attention_scores.dtype
+            )
+
+            # Build causal + attention_mask
+            causal_mask = torch.nn.functional.pad(causal_mask.T * dtype_min, (attention_mask.size()[-1] - self.block_size, 0), value=0)
+            attention_mask = torch.max(attention_mask + causal_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0), dtype_min)

+            attention_scores = attention_scores + attention_mask
         del attention_mask
+        del causal_mask

         # Normalize the attention scores to probabilities.
         attention_probs = nn.Softmax(dim=-1)(attention_scores)
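The rewritten block folds the causal constraint into the additive attention mask instead of overwriting score entries with the dtype minimum: the strictly upper-triangular block (causal_mask.T scaled by dtype_min) is left-padded with zeros to the key length, broadcast onto attention_mask, and the sum is clamped at dtype_min so that stacking two large negative masks cannot overflow. A minimal standalone sketch of that construction follows; the tensor layout (batch, heads, n_blocks, block_size, key_len) and the concrete sizes are illustrative assumptions, not taken from this file.

import torch

# Illustrative shapes only (assumptions, not taken from this file):
# attention_scores: (batch, heads, n_blocks, block_size, key_len)
# attention_mask:   (batch, 1, n_blocks, 1, key_len), additive, 0 = visible
batch, heads, n_blocks, block_size, key_len = 2, 2, 3, 4, 12
dtype = torch.float32

attention_scores = torch.randn(batch, heads, n_blocks, block_size, key_len, dtype=dtype)
attention_mask = torch.zeros(batch, 1, n_blocks, 1, key_len, dtype=dtype)

# Strictly lower-triangular ones; transposed below so each query masks the keys after it
causal_mask = torch.tril(torch.ones(block_size, block_size, dtype=dtype), diagonal=-1)

dtype_min = torch.tensor(torch.finfo(dtype).min, dtype=dtype)

# Left-pad with zeros to key_len so only the trailing block of keys is causally masked
causal_mask = torch.nn.functional.pad(causal_mask.T * dtype_min, (key_len - block_size, 0), value=0)

# Fold into the additive mask; torch.max lower-bounds the sum at dtype_min so two
# large negative terms cannot overflow past the representable minimum
attention_mask = torch.max(attention_mask + causal_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0), dtype_min)

attention_scores = attention_scores + attention_mask
print(attention_scores.shape)  # torch.Size([2, 2, 3, 4, 12])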
@@ -971,6 +977,9 @@ class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
     """

     config_class = LSGXLMRobertaConfig
+    base_model_prefix = "roberta"
+    supports_gradient_checkpointing = True
+    _no_split_modules = []

     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (RobertaEncoder, LSGRobertaEncoder)):
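The three attributes added here are standard PreTrainedModel class attributes that recent transformers releases read from the pretrained base class: base_model_prefix names the encoder sub-module on task models, supports_gradient_checkpointing advertises the _set_gradient_checkpointing hook below, and _no_split_modules is consulted when loading with a device_map. A hedged usage sketch; the checkpoint name is illustrative and not taken from this commit:

from transformers import AutoModel

# Illustrative checkpoint name (assumption); trust_remote_code pulls in this
# custom modeling file from the hub repository.
model = AutoModel.from_pretrained("ccdv/lsg-xlm-roberta-base-4096", trust_remote_code=True)

# Standard API call, enabled by supports_gradient_checkpointing = True
model.gradient_checkpointing_enable()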
@@ -983,9 +992,6 @@ class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
     documentation alongside usage examples.
     """

-    config_class = LSGXLMRobertaConfig
-
-
     def __init__(self, config, add_pooling_layer=True):

         LSGRobertaPreTrainedModel.__init__(self, config)
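config_class is now declared once on LSGRobertaPreTrainedModel (previous hunk) and reaches the task classes through ordinary attribute inheritance, which is what this removal and the matching removals in the classification and question-answering hunks below rely on. A small sketch, assuming this module is importable and defines both names:

# Hypothetical check, assuming modeling_lsg_xlm_roberta defines both classes.
from modeling_lsg_xlm_roberta import LSGXLMRobertaConfig, LSGXLMRobertaModel

# The subclass no longer redeclares config_class; attribute lookup falls through
# to LSGRobertaPreTrainedModel in the MRO.
print(LSGXLMRobertaModel.config_class is LSGXLMRobertaConfig)  # expected: True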
@@ -1022,10 +1028,7 @@ class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):

 class LSGXLMRobertaForCausalLM(LSGRobertaPreTrainedModel, RobertaForCausalLM):

-    config_class = LSGXLMRobertaConfig
-    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
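Recent transformers releases (the >= 4.29.1 target of this commit) track tied parameters through _tied_weights_keys, so the causal-LM head now lists its tied decoder weights there instead of in the _keys_to_ignore_on_* lists. A short sketch with the upstream RobertaForCausalLM that this class inherits from, built from a tiny config so no checkpoint is needed:

from transformers import RobertaConfig, RobertaForCausalLM

# Tiny decoder config (illustrative sizes) so the model can be built locally;
# requires a transformers version that defines _tied_weights_keys.
config = RobertaConfig(vocab_size=64, hidden_size=32, num_hidden_layers=1,
                       num_attention_heads=2, intermediate_size=64, is_decoder=True)
model = RobertaForCausalLM(config)

# The keys listed in _tied_weights_keys name weights that share storage with the
# input embeddings once tie_weights() has run; transformers uses this list to
# handle them consistently at save and load time.
print(RobertaForCausalLM._tied_weights_keys)
print(model.lm_head.decoder.weight.data_ptr() == model.get_input_embeddings().weight.data_ptr())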
 
@@ -1053,6 +1056,7 @@ class LSGXLMRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):
     _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
     _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
     _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):
@@ -1079,8 +1083,6 @@ class LSGXLMRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForSequenceClassification):
     This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
     appropriate documentation alongside usage examples.
     """
-    config_class = LSGXLMRobertaConfig
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def __init__(self, config):
@@ -1121,9 +1123,6 @@ class LSGXLMRobertaForTokenClassification(LSGRobertaPreTrainedModel, RobertaForTokenClassification):
     This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the superclass for the
     appropriate documentation alongside usage examples.
     """
-    config_class = LSGXLMRobertaConfig
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def __init__(self, config):
@@ -1147,9 +1146,6 @@ class LSGXLMRobertaForQuestionAnswering(LSGRobertaPreTrainedModel, RobertaForQuestionAnswering):
     This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the superclass for the
     appropriate documentation alongside usage examples.
     """
-    config_class = LSGXLMRobertaConfig
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def __init__(self, config):