ccdv committed
Commit a543eae
Parent: cb91b4f

update for transformers >= 4.29.1

Files changed (1):
  1. modeling_lsg_camembert.py +17 -24
modeling_lsg_camembert.py CHANGED
@@ -188,19 +188,25 @@ class CausalAttentionProduct(nn.Module):
         del key_layer
 
         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in CamembertModel forward() function)
-            attention_scores = attention_scores + attention_mask
-
             # Add causal mask
             causal_shape = (self.block_size, self.block_size) if causal_shape is None else causal_shape
             causal_mask = torch.tril(
                 torch.ones(*causal_shape, device=attention_mask.device, dtype=attention_scores.dtype),
                 diagonal=-1
                 )
-            causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
-            attention_scores[..., -causal_shape[0]:, -causal_shape[1] + 1:] = causal_mask[:, 1:]
+
+            # Min value
+            dtype_min = torch.tensor(
+                torch.finfo(attention_scores.dtype).min, device=attention_scores.device, dtype=attention_scores.dtype
+            )
+
+            # Build causal + attention_mask
+            causal_mask = torch.nn.functional.pad(causal_mask.T * dtype_min, (attention_mask.size()[-1] - self.block_size, 0), value=0)
+            attention_mask = torch.max(attention_mask + causal_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0), dtype_min)
 
+            attention_scores = attention_scores + attention_mask
             del attention_mask
+            del causal_mask
 
         # Normalize the attention scores to probabilities.
         attention_probs = nn.Softmax(dim=-1)(attention_scores)
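This hunk stops writing the causal mask into `attention_scores` with an in-place slice assignment (which also overwrote the padding mask already added to those positions) and instead folds the causal term into the additive `attention_mask` before a single addition, clamping at the dtype minimum so the two masks cannot underflow when summed. A minimal sketch of the pattern, with shapes simplified to 2D and all sizes invented for illustration (the real module operates on blocked, batched tensors):

import torch
import torch.nn.functional as F

block_size, seq_len = 4, 10
dtype = torch.float32
dtype_min = torch.tensor(torch.finfo(dtype).min, dtype=dtype)

# Additive padding mask: 0 where attending is allowed, dtype_min where masked.
attention_mask = torch.zeros(seq_len, dtype=dtype)
attention_mask[-2:] = dtype_min  # pretend the last two keys are padding

# Strictly lower-triangular ones, transposed so the upper triangle (future
# positions) carries dtype_min, then left-padded with zeros up to seq_len.
causal_mask = torch.tril(torch.ones(block_size, block_size, dtype=dtype), diagonal=-1)
causal_mask = F.pad(causal_mask.T * dtype_min, (seq_len - block_size, 0), value=0)

# Merge both masks; torch.max keeps the sum from dropping below dtype_min.
merged = torch.max(attention_mask + causal_mask, dtype_min)

scores = torch.randn(block_size, seq_len) + merged  # single masked addition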
@@ -974,6 +980,8 @@ class LSGCamembertPreTrainedModel(CamembertPreTrainedModel):
     """
 
     config_class = LSGCamembertConfig
+    base_model_prefix = "roberta"
+    supports_gradient_checkpointing = True
 
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, (CamembertEncoder, LSGCamembertEncoder)):
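The two new class attributes align the base class with the transformers >= 4.29 loading path: `base_model_prefix = "roberta"` matches the upstream CamemBERT/RoBERTa state-dict naming, and `supports_gradient_checkpointing = True` is the flag newer transformers checks before honoring `gradient_checkpointing_enable()`. A hedged usage sketch; the checkpoint name is an assumption, not part of this diff:

from transformers import AutoModel

# Checkpoint name assumed for illustration; any LSG CamemBERT checkpoint applies.
model = AutoModel.from_pretrained("ccdv/lsg-camembert-base-4096", trust_remote_code=True)

# Honored because the class now declares supports_gradient_checkpointing = True:
# activations are recomputed during backward to cut training memory.
model.gradient_checkpointing_enable()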
@@ -986,8 +994,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, CamembertModel):
     documentation alongside usage examples.
     """
 
-    config_class = LSGCamembertConfig
-
+    _no_split_modules = []
 
     def __init__(self, config, add_pooling_layer=True):
 
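The duplicate `config_class` is dropped (it already lives on `LSGCamembertPreTrainedModel`), and `_no_split_modules` is the hint that accelerate's `device_map` planner reads when sharding a model across devices; declaring it as an empty list states that no submodule must be kept whole on one device. A sketch under the same assumed checkpoint name, with `accelerate` installed:

from transformers import AutoModel

# device_map="auto" lets accelerate place layers across available devices;
# the empty _no_split_modules list imposes no placement constraints.
model = AutoModel.from_pretrained(
    "ccdv/lsg-camembert-base-4096", trust_remote_code=True, device_map="auto"
)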
@@ -1025,9 +1032,7 @@ class LSGCamembertModel(LSGCamembertPreTrainedModel, CamembertModel):
 
 class LSGCamembertForCausalLM(LSGCamembertPreTrainedModel, CamembertForCausalLM):
 
-    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
 
     def __init__(self, config):
 
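The `_keys_to_ignore_on_*` lists were the pre-4.29 way to tell `from_pretrained` which missing or unexpected checkpoint keys are benign. transformers >= 4.29 instead reads `_tied_weights_keys`, which names the parameters that alias the input embeddings and therefore never need their own checkpoint entry; the `position_ids` entries are likewise obsolete now that recent transformers registers that buffer as non-persistent. The same swap is applied to `LSGCamembertForMaskedLM` in the next hunk. A minimal sketch of the tying that makes the decoder weights redundant, with illustrative sizes:

import torch.nn as nn

# Toy LM head: the decoder shares its weight tensor with the input embeddings,
# so only one copy has to live in the checkpoint; the other is rebuilt by
# weight tying on load. This is what _tied_weights_keys declares.
vocab_size, hidden = 32005, 768  # CamemBERT-like sizes, for illustration
embeddings = nn.Embedding(vocab_size, hidden)
decoder = nn.Linear(hidden, vocab_size, bias=True)
decoder.weight = embeddings.weight  # tie the projection to the embeddings

assert decoder.weight.data_ptr() == embeddings.weight.data_ptr()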
@@ -1052,9 +1057,7 @@ class LSGCamembertForMaskedLM(LSGCamembertPreTrainedModel, CamembertForMaskedLM):
     documentation alongside usage examples.
     """
 
-    _keys_to_ignore_on_save = [r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_missing = [r"position_ids", r"lm_head.decoder.weight", r"lm_head.decoder.bias"]
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]
 
     def __init__(self, config):
 
@@ -1082,8 +1085,6 @@ class LSGCamembertForSequenceClassification(LSGCamembertPreTrainedModel, CamembertForSequenceClassification):
     appropriate documentation alongside usage examples.
     """
 
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
 
         LSGCamembertPreTrainedModel.__init__(self, config)
@@ -1104,8 +1105,6 @@ class LSGCamembertForMultipleChoice(LSGCamembertPreTrainedModel, CamembertForMultipleChoice):
     appropriate documentation alongside usage examples.
     """
 
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
 
         LSGCamembertPreTrainedModel.__init__(self, config)
@@ -1124,9 +1123,6 @@ class LSGCamembertForTokenClassification(LSGCamembertPreTrainedModel, CamembertForTokenClassification):
     appropriate documentation alongside usage examples.
     """
 
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
     def __init__(self, config):
 
         LSGCamembertPreTrainedModel.__init__(self, config)
@@ -1150,9 +1146,6 @@ class LSGCamembertForQuestionAnswering(LSGCamembertPreTrainedModel, CamembertForQuestionAnswering):
     appropriate documentation alongside usage examples.
    """
 
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
-
    def __init__(self, config):
 
        LSGCamembertPreTrainedModel.__init__(self, config)
@@ -1176,4 +1169,4 @@ try:
         str_to_class(value.split(".")[-1]).register_for_auto_class(key)
 except:
     warn("AutoRegister isn't available, you'll have to manually copy modeling.py after .save_pretrained(...).")
-    warn("Update to transformers >= 4.17.0 to fix.")
+    warn("Update to transformers >= 4.23.1 to fix.")
 