small fix
- README.md +1 -1
- modeling_lsg_roberta.py +20 -7
README.md
CHANGED
@@ -5,7 +5,7 @@ tags:
 ---

 # LSG model
-**Transformers >= 4.
+**Transformers >= 4.36.1**\
 **This model relies on a custom modeling file, you need to add trust_remote_code=True**\
 **See [\#13467](https://github.com/huggingface/transformers/pull/13467)**

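For reference, the README requirement translates into the following loading pattern. This is a minimal sketch, assuming transformers >= 4.36.1 is installed; the repository id is a placeholder, not this specific checkpoint.

```python
from transformers import AutoModel, AutoTokenizer

# Placeholder repository id; substitute the LSG checkpoint that ships this modeling file.
model_id = "username/lsg-roberta-checkpoint"

# trust_remote_code=True is required because the model class lives in the custom
# modeling_lsg_roberta.py shipped with the repository, not inside the transformers library.
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
model = AutoModel.from_pretrained(model_id, trust_remote_code=True)
```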
modeling_lsg_roberta.py
CHANGED
@@ -411,13 +411,11 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):

     def forward(
         self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
-
+    ):
         if position_ids is None:
             if input_ids is not None:
                 # Create the position ids from the input token ids. Any padded tokens remain padded.
-                position_ids = create_position_ids_from_input_ids(
-                    input_ids, self.padding_idx, past_key_values_length
-                ).to(input_ids.device)
+                position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length)
             else:
                 position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds)

@@ -426,10 +424,18 @@ class LSGRobertaEmbeddings(RobertaEmbeddings):
         else:
             input_shape = inputs_embeds.size()[:-1]

-        seq_length = input_shape[
+        seq_length = input_shape[1]

+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
         if token_type_ids is None:
-
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)

         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
@@ -998,6 +1004,7 @@ class LSGRobertaEncoder(RobertaEncoder):
         encoder_outputs.last_hidden_state = sequence_output
         return encoder_outputs

+
 class LSGRobertaPreTrainedModel(RobertaPreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -1033,6 +1040,12 @@ class LSGRobertaModel(LSGRobertaPreTrainedModel, RobertaModel):
                 "Cross attention is computed using full attention since it is not LSG compatible."
             )

+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        if self._use_flash_attention_2:
+            logger.warning(
+                "[WARNING flash-attention]: LSG doesnt support flash-attention currently"
+            )
+
         # Initialize weights and apply final processing
         self.post_init()

@@ -1190,4 +1203,4 @@ try:
         str_to_class(value.split(".")[-1]).register_for_auto_class(key)
 except:
     warn("AutoRegister isn't available, you'll have to manually copy modeling.py after .save_pretrained(...).")
-    warn("Update to transformers >= 4.
+    warn("Update to transformers >= 4.36.1 to fix.")
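The first hunk collapses the position-id construction into a single call and drops the explicit `.to(input_ids.device)`; the helper derives the ids from `input_ids` itself, so they already sit on the right device. As an illustration of the "padded tokens remain padded" behaviour, here is a rough sketch of what such a helper computes; it is an assumed equivalent for illustration, not the transformers source.

```python
import torch

def create_position_ids_sketch(input_ids, padding_idx, past_key_values_length=0):
    # Non-padded tokens get incremental position ids starting after padding_idx;
    # padded tokens keep padding_idx, so padding never receives a real position.
    mask = input_ids.ne(padding_idx).int()
    incremental_indices = (torch.cumsum(mask, dim=1) + past_key_values_length) * mask
    return incremental_indices.long() + padding_idx

ids = torch.tensor([[5, 7, 9, 1, 1]])  # 1 acts as padding_idx here
print(create_position_ids_sketch(ids, padding_idx=1))  # tensor([[2, 3, 4, 1, 1]])
```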
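The second hunk follows the registered-buffer pattern used by upstream RobertaEmbeddings: when no token_type_ids are passed, a buffer of zeros registered in the constructor is sliced to the sequence length and expanded over the batch, which keeps tracing without token_type_ids working. Below is a self-contained sketch of that pattern; the class and method names are illustrative, not the LSG module itself.

```python
import torch
from torch import nn

class EmbeddingsSketch(nn.Module):
    def __init__(self, max_position_embeddings=512):
        super().__init__()
        # Non-persistent buffer of zeros registered in the constructor,
        # so token_type_ids can be auto-generated when the caller omits them.
        self.register_buffer(
            "token_type_ids",
            torch.zeros((1, max_position_embeddings), dtype=torch.long),
            persistent=False,
        )

    def resolve_token_type_ids(self, input_shape, token_type_ids=None):
        batch_size, seq_length = input_shape
        if token_type_ids is None:
            # Slice the buffer to the current length and broadcast over the batch.
            buffered = self.token_type_ids[:, :seq_length]
            token_type_ids = buffered.expand(batch_size, seq_length)
        return token_type_ids

module = EmbeddingsSketch()
print(module.resolve_token_type_ids((2, 8)).shape)  # torch.Size([2, 8])
```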