ccdv committed
Commit 44bd267
1 Parent(s): 4d4354b
Files changed (2)
  1. README.md +1 -1
  2. modeling_lsg_roberta.py +20 -7
README.md CHANGED
@@ -7,7 +7,7 @@ pipeline_tag: fill-mask
 ---
 
 # LSG model
-**Transformers >= 4.35.2**\
+**Transformers >= 4.36.1**\
 **This model relies on a custom modeling file, you need to add trust_remote_code=True**\
 **See [\#13467](https://github.com/huggingface/transformers/pull/13467)**
 
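For reference, a model that relies on a custom modeling file is loaded with `trust_remote_code=True`, as the README notes. A minimal sketch, assuming a fill-mask checkpoint and using a placeholder repository id (the actual repo id may differ):

```python
# Minimal sketch: loading an LSG checkpoint that ships custom modeling code.
# "ccdv/lsg-roberta-example" is a placeholder repository id.
from transformers import AutoModelForMaskedLM, AutoTokenizer

model = AutoModelForMaskedLM.from_pretrained(
    "ccdv/lsg-roberta-example",
    trust_remote_code=True,  # required: the architecture lives in modeling_lsg_roberta.py
)
tokenizer = AutoTokenizer.from_pretrained("ccdv/lsg-roberta-example", trust_remote_code=True)
```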
modeling_lsg_roberta.py CHANGED
@@ -411,13 +411,11 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):
 
     def forward(
         self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
-    ):
+    ):
         if position_ids is None:
             if input_ids is not None:
                 # Create the position ids from the input token ids. Any padded tokens remain padded.
-                position_ids = create_position_ids_from_input_ids(
-                    input_ids, self.padding_idx, past_key_values_length
-                ).to(input_ids.device)
+                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
             else:
                 position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)
 
@@ -426,10 +424,18 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):
         else:
             input_shape = inputs_embeds.size()[:-1]
 
-        seq_length = input_shape[-1]
+        seq_length = input_shape[1]
 
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
         if token_type_ids is None:
-            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
 
         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
@@ -998,6 +1004,7 @@ class LSGRobertaEncoder(RobertaEncoder):
             encoder_outputs.last_hidden_state = sequence_output
         return encoder_outputs
 
+
 class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -1033,6 +1040,12 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
                 "Cross attention is computed using full attention since it is not LSG compatible."
             )
 
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        if self._use_flash_attention_2:
+            logger.warning(
+                "[WARNING flash-attention]: LSG doesnt support flash-attention currently"
+            )
+
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1190,4 +1203,4 @@ try:
         str_to_class(value.split(".")[-1]).register_for_auto_class(key)
 except:
     warn("AutoRegister isn't available, you'll have to manually copy modeling.py after .save_pretrained(...).")
-    warn("Update to transformers >= 4.35.2 to fix.")
+    warn("Update to transformers >= 4.36.1 to fix.")