update for transformers >= 4.29.1
modeling_lsg_albert.py  CHANGED  +14 -11
@@ -188,19 +188,25 @@ class CausalAttentionProduct(nn.Module):
         del key_layer

         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in AlbertModel forward() function)
-            attention_scores = attention_scores + attention_mask
-
             # Add causal mask
             causal_shape = (self.block_size, self.block_size) if causal_shape is None else causal_shape
             causal_mask = torch.tril(
                 torch.ones(*causal_shape, device=attention_mask.device, dtype=attention_scores.dtype),
                 diagonal=-1
                 )
-            causal_mask = causal_mask.T * torch.finfo(attention_scores.dtype).min
-            attention_scores[..., -causal_shape[0]:, -causal_shape[1] + 1:] = causal_mask[:, 1:]
+
+            # Min value
+            dtype_min = torch.tensor(
+                torch.finfo(attention_scores.dtype).min, device=attention_scores.device, dtype=attention_scores.dtype
+            )
+
+            # Build causal + attention_mask
+            causal_mask = torch.nn.functional.pad(causal_mask.T * dtype_min, (attention_mask.size()[-1] - self.block_size, 0), value=0)
+            attention_mask = torch.max(attention_mask + causal_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0), dtype_min)

+            attention_scores = attention_scores + attention_mask
             del attention_mask
+            del causal_mask

         # Normalize the attention scores to probabilities.
         attention_probs = nn.Softmax(dim=-1)(attention_scores)
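For reference, a minimal standalone sketch of what the reworked masking above computes: the strictly lower-triangular ones matrix is transposed (so future positions within the block are masked), scaled to the dtype minimum, left-padded to the key length, folded into the additive attention mask, and clamped so the stacked masks cannot overflow to -inf. The toy shapes, the padded key position, and the 5-dimensional mask layout are illustrative assumptions, not values taken from the file.

import torch

# Toy setup: one block of 4 queries attending to 10 keys
# (6 past keys followed by the 4 keys of the current block).
block_size, num_keys = 4, 10
dtype = torch.float32

# Additive attention mask: 0 for visible keys, dtype minimum for masked ones.
attention_mask = torch.zeros(1, 1, 1, block_size, num_keys, dtype=dtype)
attention_mask[..., -1] = torch.finfo(dtype).min  # pretend the last key is padding

# Strictly lower-triangular ones; the transpose below turns it into a mask on future keys.
causal_mask = torch.tril(torch.ones(block_size, block_size, dtype=dtype), diagonal=-1)

# Minimum value of the dtype, used instead of -inf to keep the softmax finite.
dtype_min = torch.tensor(torch.finfo(dtype).min, dtype=dtype)

# Left-pad so the causal block only covers the most recent keys, fold it into the
# additive mask, and clamp at dtype_min so the summed masks cannot overflow to -inf.
causal_mask = torch.nn.functional.pad(causal_mask.T * dtype_min, (num_keys - block_size, 0), value=0)
attention_mask = torch.max(attention_mask + causal_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0), dtype_min)

print(attention_mask[0, 0, 0])  # 0 where a query may attend, dtype_min elsewhere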
@@ -838,7 +844,6 @@ class LSGAlbertPreTrainedModel(PreTrainedModel):
     config_class = LSGAlbertConfig
     load_tf_weights = load_tf_weights_in_albert
     base_model_prefix = "albert"
-    _keys_to_ignore_on_load_missing = [r"position_ids"]

     def _init_weights(self, module):
         """Initialize the weights."""
@@ -881,6 +886,8 @@ class LSGAlbertModel(LSGAlbertPreTrainedModel, AlbertModel):

 class LSGAlbertForPreTraining(LSGAlbertPreTrainedModel, AlbertForPreTraining):

+    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]
+
     def __init__(self, config):

         LSGAlbertPreTrainedModel.__init__(self, config)
@@ -895,7 +902,7 @@ class LSGAlbertForPreTraining(LSGAlbertPreTrainedModel, AlbertForPreTraining):

 class LSGAlbertForMaskedLM(LSGAlbertPreTrainedModel, AlbertForMaskedLM):

-
+    _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"]

     def __init__(self, config):
         LSGAlbertPreTrainedModel.__init__(self, config)
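For context on the attribute changes above: transformers >= 4.29 expects tied parameters to be declared through _tied_weights_keys rather than silenced via the old load-key lists. A small sketch against the upstream Albert classes; the checkpoint download and the exact key list are assumptions mirrored from the lines added above, not taken from this file.

from transformers import AlbertForMaskedLM

model = AlbertForMaskedLM.from_pretrained("albert-base-v2")

# The class declares which state-dict keys are tied copies of other parameters.
print(AlbertForMaskedLM._tied_weights_keys)

# The MLM decoder weight shares storage with the input word embeddings, which is
# why it is listed as a tied weight rather than reported as a missing key.
decoder = model.predictions.decoder.weight
embeddings = model.albert.embeddings.word_embeddings.weight
print(decoder.data_ptr() == embeddings.data_ptr())  # expected: True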
@@ -925,8 +932,6 @@ class LSGAlbertForSequenceClassification(LSGAlbertPreTrainedModel, AlbertForSequenceClassification):

 class LSGAlbertForTokenClassification(LSGAlbertPreTrainedModel, AlbertForTokenClassification):

-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     def __init__(self, config):

         LSGAlbertPreTrainedModel.__init__(self, config)
@@ -947,8 +952,6 @@ class LSGAlbertForTokenClassification(LSGAlbertPreTrainedModel, AlbertForTokenClassification):

 class LSGAlbertForQuestionAnswering(LSGAlbertPreTrainedModel, AlbertForQuestionAnswering):

-    _keys_to_ignore_on_load_unexpected = [r"pooler"]
-
     def __init__(self, config):

         LSGAlbertPreTrainedModel.__init__(self, config)
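With these changes the custom modeling code follows the transformers >= 4.29.1 API. A hedged loading sketch; the checkpoint id below is a placeholder, not the name of this repository.

# pip install "transformers>=4.29.1"
from transformers import AutoModelForMaskedLM, AutoTokenizer

checkpoint = "your-namespace/lsg-albert-base-4096"  # placeholder id, substitute the actual LSG-Albert checkpoint

# trust_remote_code makes transformers load the custom modeling file shipped with the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForMaskedLM.from_pretrained(checkpoint, trust_remote_code=True)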