ccdv committed
Commit: 728b7d8
Parent: 543919c

update for transformers >= 4.29.1

Files changed (1): modeling_lsg_albert.py (+14 -11)
modeling_lsg_albert.py CHANGED
@@ -188,19 +188,25 @@ class CausalAttentionProduct(nn.Module):
         del key_layer
 
         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in AlbertModel forward() function)
-            attention_scores = attention_scores + attention_mask
-
             # Add causal mask
             causal_shape = (self.block_size, self.block_size) if causal_shape is None else causal_shape
             causal_mask = torch.tril(
                 torch.ones(*causal_shape, device=attention_mask.device, dtype=attention_scores.dtype),
                 diagonal=-1
             )
-            causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
-            attention_scores[..., -causal_shape[0]:, -causal_shape[1] + 1:] = causal_mask[:, 1:]
+
+            # Min value
+            dtype_min = torch.tensor(
+                torch.finfo(attention_scores.dtype).min, device=attention_scores.device, dtype=attention_scores.dtype
+            )
+
+            # Build causal + attention_mask
+            causal_mask = torch.nn.functional.pad(causal_mask.T * dtype_min, (attention_mask.size()[-1] - self.block_size, 0), value=0)
+            attention_mask = torch.max(attention_mask + causal_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0), dtype_min)
 
+            attention_scores = attention_scores + attention_mask
             del attention_mask
+            del causal_mask
 
         # Normalize the attention scores to probabilities.
         attention_probs = nn.Softmax(dim=-1)(attention_scores)
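The rewritten branch builds a single additive mask instead of slicing the causal band directly into the score tensor: the transposed causal mask is scaled to the dtype minimum, left-padded to the key length, merged into `attention_mask`, and clamped with `torch.max` so that two stacked masks cannot overflow past the representable minimum. A minimal sketch with toy shapes (the sizes and the 4-D layout are assumptions for illustration; the real module runs on LSG's blocked tensors with `block_size` coming from the model config):

```python
import torch
import torch.nn.functional as F

block_size, key_len = 4, 10  # toy sizes, not the model's real config
dtype = torch.float32

# Additive padding mask: 0 where attending is allowed, finfo.min where masked.
attention_mask = torch.zeros(1, 1, 1, key_len, dtype=dtype)
attention_mask[..., -2:] = torch.finfo(dtype).min  # pretend the last 2 keys are padding

# Strictly lower triangle, transposed -> 1s strictly above the diagonal.
causal = torch.tril(torch.ones(block_size, block_size, dtype=dtype), diagonal=-1).T

dtype_min = torch.tensor(torch.finfo(dtype).min, dtype=dtype)

# Left-pad so the causal band lines up with the last block_size keys.
causal = F.pad(causal * dtype_min, (key_len - block_size, 0), value=0)

# Merge both masks; torch.max clamps the sum so that
# finfo.min + finfo.min does not overflow to -inf.
merged = torch.max(attention_mask + causal.unsqueeze(0).unsqueeze(0), dtype_min)

scores = torch.randn(1, 1, block_size, key_len, dtype=dtype) + merged
probs = torch.softmax(scores, dim=-1)  # finite everywhere
```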
@@ -838,7 +844,6 @@ class LSGAlbertPreTrainedModel(PreTrainedModel):
     config_class = LSGAlbertConfig
     load_tf_weights = load_tf_weights_in_albert
     base_model_prefix = "albert"
-    _keys_to_ignore_on_load_missing = [r"position_ids"]
 
     def _init_weights(self, module):
         """Initialize the weights."""
@@ -881,6 +886,8 @@ class LSGAlbertModel(LSGAlbertPreTrainedModel, AlbertModel):
 
 class LSGAlbertForPreTraining(LSGAlbertPreTrainedModel, AlbertForPreTraining):
 
+    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
+
     def __init__(self, config):
 
         LSGAlbertPreTrainedModel.__init__(self, config)
@@ -895,7 +902,7 @@ class LSGAlbertForPreTraining(LSGAlbertPreTrainedModel, AlbertForPreTraining):
 
 class LSGAlbertForMaskedLM(LSGAlbertPreTrainedModel, AlbertForMaskedLM):
 
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
 
     def __init__(self, config):
         LSGAlbertPreTrainedModel.__init__(self, config)
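The class-attribute churn in this and the remaining hunks tracks the upstream API: the diff drops the old `_keys_to_ignore_on_load_missing` / `_keys_to_ignore_on_load_unexpected` lists and adopts the `_tied_weights_keys` attribute that transformers >= 4.29 uses to track parameters shared with the input embeddings. A hedged sketch of the pattern (the subclass name is invented; the key list mirrors the diff above):

```python
from transformers import AlbertForMaskedLM

# Invented subclass name; the attribute is the pattern this commit adopts.
class TiedAlbertForMaskedLM(AlbertForMaskedLM):
    # transformers >= 4.29 reads this list when tying weights and when
    # loading/saving, so decoder parameters that alias the input embeddings
    # are neither reported as missing nor serialized twice.
    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]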
@@ -925,8 +932,6 @@ class LSGAlbertForSequenceClassification(LSGAlbertPreTrainedModel, AlbertForSequenceClassification):
 
 class LSGAlbertForTokenClassification(LSGAlbertPreTrainedModel, AlbertForTokenClassification):
 
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     def __init__(self, config):
 
         LSGAlbertPreTrainedModel.__init__(self, config)
@@ -947,8 +952,6 @@ class LSGAlbertForTokenClassification(LSGAlbertPreTrainedModel, AlbertForTokenClassification):
 
 class LSGAlbertForQuestionAnswering(LSGAlbertPreTrainedModel, AlbertForQuestionAnswering):
 
-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     def __init__(self, config):
 
         LSGAlbertPreTrainedModel.__init__(self, config)
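Since `modeling_lsg_albert.py` ships as custom code on the Hub, a quick way to smoke-test the updated file against transformers >= 4.29.1 is to load a checkpoint with remote code enabled (the repo id below is illustrative):

```python
from transformers import AutoModelForMaskedLM, AutoTokenizer

repo_id = "ccdv/lsg-albert-base-4096"  # illustrative repo id

# trust_remote_code pulls modeling_lsg_albert.py from the repository.
tokenizer = AutoTokenizer.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)
```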
 