ccdv committed
Commit f9e0ca0
1 Parent(s): fef817c
Files changed (2)
  1. README.md +1 -1
  2. modeling_lsg_xlm_roberta.py +51 -38
README.md CHANGED
@@ -7,7 +7,7 @@ pipeline_tag: fill-mask
 ---
 
 # LSG model
-**Transformers >= 4.35.2**\
+**Transformers >= 4.36.1**\
 **This model relies on a custom modeling file, you need to add trust_remote_code=True**\
 **See [\#13467](https://github.com/huggingface/transformers/pull/13467)**
 
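To make the README note concrete, here is a minimal loading sketch (not part of this commit; the repository id below is a placeholder for whichever LSG XLM-RoBERTa checkpoint this card describes):

```python
from transformers import AutoModelForMaskedLM, AutoTokenizer

# Placeholder repository id -- substitute the actual checkpoint this card belongs to.
repo_id = "ccdv/lsg-xlm-roberta-base"

# trust_remote_code=True is required because the model is defined in the custom
# modeling_lsg_xlm_roberta.py file shipped with the checkpoint.
model = AutoModelForMaskedLM.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)
```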
modeling_lsg_xlm_roberta.py CHANGED
@@ -1,5 +1,5 @@
 from logging import warn
-from transformers.models.roberta.modeling_roberta import *
+from transformers.models.xlm_roberta.modeling_xlm_roberta import *
 import torch
 import torch.nn as nn
 from transformers.models.xlm_roberta.configuration_xlm_roberta import XLMRobertaConfig
@@ -153,7 +153,7 @@ class BaseAttentionProduct(nn.Module):
         del key_layer

         if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in RobertaModel forward() function)
+            # Apply the attention mask is (precomputed for all layers in XLMRobertaModel forward() function)
             attention_scores = attention_scores + attention_mask
             del attention_mask

@@ -397,7 +397,7 @@ class LSGAttentionProduct(nn.Module):
         return x.reshape(*x.size()[:-2], n_blocks, -1, d)


-class LSGRobertaEmbeddings(RobertaEmbeddings):
+class LSGXLMRobertaEmbeddings(XLMRobertaEmbeddings):

     def __init__(self, config):
         super().__init__(config)
@@ -411,13 +411,11 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):

     def forward(
         self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
-    ):
+    ):
         if position_ids is None:
             if input_ids is not None:
                 # Create the position ids from the input token ids. Any padded tokens remain padded.
-                position_ids = create_position_ids_from_input_ids(
-                    input_ids, self.padding_idx, past_key_values_length
-                ).to(input_ids.device)
+                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
             else:
                 position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

@@ -426,10 +424,18 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):
         else:
             input_shape = inputs_embeds.size()[:-1]

-        seq_length = input_shape[-1]
+        seq_length = input_shape[1]

+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
         if token_type_ids is None:
-            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
@@ -453,7 +459,7 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):
         return embeddings


-class LSGAttention(RobertaAttention):
+class LSGAttention(XLMRobertaAttention):

     def __init__(self, config):

@@ -912,7 +918,7 @@ class LSGSelfAttention(BaseSelfAttention):
         return x.reshape(n, h, -1, chunk_size, d)


-class LSGRobertaLayer(RobertaLayer):
+class LSGXLMRobertaLayer(XLMRobertaLayer):

     def __init__(self, config):

@@ -924,12 +930,12 @@ class LSGRobertaLayer(RobertaLayer):
             self.crossattention = LSGAttention(config)


-class LSGRobertaEncoder(RobertaEncoder):
+class LSGXLMRobertaEncoder(XLMRobertaEncoder):

     def __init__(self, config):

         super().__init__(config)
-        self.layer = nn.ModuleList([LSGRobertaLayer(config) for _ in range(config.num_hidden_layers)])
+        self.layer = nn.ModuleList([LSGXLMRobertaLayer(config) for _ in range(config.num_hidden_layers)])

         assert hasattr(config, "num_global_tokens")
         self.num_global_tokens = config.num_global_tokens
@@ -997,7 +1003,8 @@ class LSGRobertaEncoder(RobertaEncoder):
         encoder_outputs.last_hidden_state = sequence_output
         return encoder_outputs

-class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
+
+class LSGXLMRobertaPreTrainedModel(XLMRobertaPreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
     models.
@@ -1009,11 +1016,11 @@ class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
     _no_split_modules = []

     def _set_gradient_checkpointing(self, module, value=False):
-        if isinstance(module, (RobertaEncoder, LSGRobertaEncoder)):
+        if isinstance(module, (XLMRobertaEncoder, LSGXLMRobertaEncoder)):
             module.gradient_checkpointing = value


-class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
+class LSGXLMRobertaModel(LSGXLMRobertaPreTrainedModel, XLMRobertaModel):
     """
     This class overrides :class:`~transformers.RobertaModel`. Please check the superclass for the appropriate
     documentation alongside usage examples.
@@ -1021,17 +1028,23 @@ class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):

     def __init__(self, config, add_pooling_layer=True):

-        LSGRobertaPreTrainedModel.__init__(self, config)
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

-        self.embeddings = LSGRobertaEmbeddings(config)
-        self.encoder = LSGRobertaEncoder(config)
-        self.pooler = RobertaPooler(config) if add_pooling_layer else None
+        self.embeddings = LSGXLMRobertaEmbeddings(config)
+        self.encoder = LSGXLMRobertaEncoder(config)
+        self.pooler = XLMRobertaPooler(config) if add_pooling_layer else None

         if config.add_cross_attention:
             logger.warning(
                 "Cross attention is computed using full attention since it is not LSG compatible."
             )

+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        if self._use_flash_attention_2:
+            logger.warning(
+                "[WARNING flash-attention]: LSG doesnt support flash-attention currently"
+                )
+
         # Initialize weights and apply final processing
         self.post_init()

@@ -1053,25 +1066,25 @@ class LSGXLMRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
         return extended_attention_mask


-class LSGXLMRobertaForCausalLM(LSGRobertaPreTrainedModel, RobertaForCausalLM):
+class LSGXLMRobertaForCausalLM(LSGXLMRobertaPreTrainedModel, XLMRobertaForCausalLM):

     _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"]

     def __init__(self, config):

-        LSGRobertaPreTrainedModel.__init__(self, config)
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         if not config.is_decoder:
-            logger.warning("If you want to use `LSGRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")
+            logger.warning("If you want to use `LSGXLMRobertaLMHeadModel` as a standalone, add `is_decoder=True.`")

         self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
-        self.lm_head = RobertaLMHead(config)
+        self.lm_head = XLMRobertaLMHead(config)

         # Initialize weights and apply final processing
         self.post_init()


-class LSGXLMRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):
+class LSGXLMRobertaForMaskedLM(LSGXLMRobertaPreTrainedModel, XLMRobertaForMaskedLM):
     """
     This class overrides :class:`~transformers.RobertaForMaskedLM`. Please check the superclass for the appropriate
     documentation alongside usage examples.
@@ -1084,22 +1097,22 @@ class LSGXLMRobertaForMaskedLM(LSGRobertaPreTrainedModel, RobertaForMaskedLM):

     def __init__(self, config):

-        LSGRobertaPreTrainedModel.__init__(self, config)
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         if config.is_decoder:
             logger.warning(
-                "If you want to use `LSGRobertaForMaskedLM` make sure `config.is_decoder=False` for "
+                "If you want to use `LSGXLMRobertaForMaskedLM` make sure `config.is_decoder=False` for "
                 "bi-directional self-attention."
             )

         self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
-        self.lm_head = RobertaLMHead(config)
+        self.lm_head = XLMRobertaLMHead(config)

         # Initialize weights and apply final processing
         self.post_init()


-class LSGXLMRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForSequenceClassification):
+class LSGXLMRobertaForSequenceClassification(LSGXLMRobertaPreTrainedModel, XLMRobertaForSequenceClassification):
     """
     This class overrides :class:`~transformers.RobertaForSequenceClassification`. Please check the superclass for the
     appropriate documentation alongside usage examples.
@@ -1107,19 +1120,19 @@ class LSGXLMRobertaForSequenceClassification(LSGRobertaPreTrainedModel, RobertaForSequenceClassification):

     def __init__(self, config):

-        LSGRobertaPreTrainedModel.__init__(self, config)
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         self.num_labels = config.num_labels
         self.config = config

         self.roberta = LSGXLMRobertaModel(config, add_pooling_layer=False)
-        self.classifier = RobertaClassificationHead(config)
+        self.classifier = XLMRobertaClassificationHead(config)

         # Initialize weights and apply final processing
         self.post_init()


-class LSGXLMRobertaForMultipleChoice(LSGRobertaPreTrainedModel, RobertaForMultipleChoice):
+class LSGXLMRobertaForMultipleChoice(LSGXLMRobertaPreTrainedModel, XLMRobertaForMultipleChoice):
     """
     This class overrides :class:`~transformers.RobertaForMultipleChoice`. Please check the superclass for the
     appropriate documentation alongside usage examples.
@@ -1129,7 +1142,7 @@ class LSGXLMRobertaForMultipleChoice(LSGRobertaPreTrainedModel, RobertaForMultipleChoice):

     def __init__(self, config):

-        LSGRobertaPreTrainedModel.__init__(self, config)
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         self.roberta = LSGXLMRobertaModel(config)
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
@@ -1139,7 +1152,7 @@ class LSGXLMRobertaForMultipleChoice(LSGRobertaPreTrainedModel, RobertaForMultipleChoice):
         self.post_init()


-class LSGXLMRobertaForTokenClassification(LSGRobertaPreTrainedModel, RobertaForTokenClassification):
+class LSGXLMRobertaForTokenClassification(LSGXLMRobertaPreTrainedModel, XLMRobertaForTokenClassification):
     """
     This class overrides :class:`~transformers.RobertaForTokenClassification`. Please check the superclass for the
     appropriate documentation alongside usage examples.
@@ -1147,7 +1160,7 @@ class LSGXLMRobertaForTokenClassification(LSGRobertaPreTrainedModel, RobertaForTokenClassification):

     def __init__(self, config):

-        LSGRobertaPreTrainedModel.__init__(self, config)
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         self.num_labels = config.num_labels

@@ -1162,7 +1175,7 @@ class LSGXLMRobertaForTokenClassification(LSGRobertaPreTrainedModel, RobertaForTokenClassification):
         self.post_init()


-class LSGXLMRobertaForQuestionAnswering(LSGRobertaPreTrainedModel, RobertaForQuestionAnswering):
+class LSGXLMRobertaForQuestionAnswering(LSGXLMRobertaPreTrainedModel, XLMRobertaForQuestionAnswering):
     """
     This class overrides :class:`~transformers.RobertaForQuestionAnswering`. Please check the superclass for the
     appropriate documentation alongside usage examples.
@@ -1170,7 +1183,7 @@ class LSGXLMRobertaForQuestionAnswering(LSGRobertaPreTrainedModel, RobertaForQuestionAnswering):

     def __init__(self, config):

-        LSGRobertaPreTrainedModel.__init__(self, config)
+        LSGXLMRobertaPreTrainedModel.__init__(self, config)

         self.num_labels = config.num_labels

@@ -1191,4 +1204,4 @@ try:
         str_to_class(value.split(".")[-1]).register_for_auto_class(key)
 except:
     warn("AutoRegister isn't available, you'll have to manually copy modeling.py after .save_pretrained(...).")
-    warn("Update to transformers >= 4.35.2 to fix.")
+    warn("Update to transformers >= 4.36.1 to fix.")
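Regarding the two warnings in the final hunk, here is an illustrative fallback sketch (not part of the commit; it reuses the model and tokenizer from the earlier loading sketch, and the output directory name is hypothetical). On transformers versions older than 4.36.1 the auto-class registration can fail, so the custom modeling file has to be copied next to the saved weights by hand after `save_pretrained`:

```python
import shutil

# Hypothetical output directory for a fine-tuned checkpoint.
output_dir = "lsg-xlm-roberta-finetuned"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Fallback when AutoRegister is unavailable: copy the custom modeling file so that
# from_pretrained(output_dir, trust_remote_code=True) can still resolve the classes.
shutil.copy("modeling_lsg_xlm_roberta.py", f"{output_dir}/modeling_lsg_xlm_roberta.py")
```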