oweller2
committed on
Commit
·
e9e8f85
1
Parent(s):
46797c8
minor
Browse files
- config.json +3 -3
- modeling_flexbert.py +3 -2
config.json
CHANGED
@@ -69,9 +69,9 @@
|
|
69 |
"num_attention_heads": 12,
|
70 |
"num_hidden_layers": 22,
|
71 |
"num_initial_layers": 1,
|
72 |
-
"pad_logits":
|
73 |
"pad_token_id": 50283,
|
74 |
-
"padding": "
|
75 |
"pooling_type": "cls",
|
76 |
"position_embedding_type": "absolute",
|
77 |
"rotary_emb_base": 10000.0,
|
@@ -82,7 +82,7 @@
|
|
82 |
"sliding_window": 128,
|
83 |
"transformers_version": "4.44.1",
|
84 |
"type_vocab_size": 2,
|
85 |
-
"unpad_embeddings":
|
86 |
"use_cache": true,
|
87 |
"use_fa2": true,
|
88 |
"use_sdpa_attn_mask": false,
|
|
|
69 |
"num_attention_heads": 12,
|
70 |
"num_hidden_layers": 22,
|
71 |
"num_initial_layers": 1,
|
72 |
+
"pad_logits": true,
|
73 |
"pad_token_id": 50283,
|
74 |
+
"padding": "unpadded",
|
75 |
"pooling_type": "cls",
|
76 |
"position_embedding_type": "absolute",
|
77 |
"rotary_emb_base": 10000.0,
|
|
|
82 |
"sliding_window": 128,
|
83 |
"transformers_version": "4.44.1",
|
84 |
"type_vocab_size": 2,
|
85 |
+
"unpad_embeddings": true,
|
86 |
"use_cache": true,
|
87 |
"use_fa2": true,
|
88 |
"use_sdpa_attn_mask": false,
|
modeling_flexbert.py
CHANGED
@@ -935,6 +935,7 @@ class FlexBertModel(FlexBertPreTrainedModel):
|
|
935 |
else:
|
936 |
self.final_norm = None
|
937 |
self.unpad_embeddings = config.unpad_embeddings
|
|
|
938 |
|
939 |
def post_init(self):
|
940 |
self._init_weights(reset_params=False)
|
@@ -956,7 +957,7 @@ class FlexBertModel(FlexBertPreTrainedModel):
|
|
956 |
max_seqlen: Optional[int] = None,
|
957 |
**kwargs,
|
958 |
) -> Tuple[Union[List[torch.Tensor], torch.Tensor], Optional[torch.Tensor]]:
|
959 |
-
if attention_mask is None:
|
960 |
attention_mask = torch.ones_like(input_ids)
|
961 |
|
962 |
embedding_output = self.embeddings(input_ids, position_ids)
|
@@ -1529,7 +1530,7 @@ class FlexBertForCausalLM(FlexBertPreTrainedModel):
|
|
1529 |
self.unpad_embeddings = config.unpad_embeddings
|
1530 |
self.pad_logits = config.pad_logits
|
1531 |
self.compile_model = config.compile_model
|
1532 |
-
|
1533 |
|
1534 |
# Initialize weights and apply final processing
|
1535 |
self._init_weights(reset_params=False)
|
|
|
935 |
else:
|
936 |
self.final_norm = None
|
937 |
self.unpad_embeddings = config.unpad_embeddings
|
938 |
+
self.is_decoder = config.causal_mask
|
939 |
|
940 |
def post_init(self):
|
941 |
self._init_weights(reset_params=False)
|
|
|
957 |
max_seqlen: Optional[int] = None,
|
958 |
**kwargs,
|
959 |
) -> Tuple[Union[List[torch.Tensor], torch.Tensor], Optional[torch.Tensor]]:
|
960 |
+
if attention_mask is None and not self.is_decoder:
|
961 |
attention_mask = torch.ones_like(input_ids)
|
962 |
|
963 |
embedding_output = self.embeddings(input_ids, position_ids)
|
|
|
1530 |
self.unpad_embeddings = config.unpad_embeddings
|
1531 |
self.pad_logits = config.pad_logits
|
1532 |
self.compile_model = config.compile_model
|
1533 |
+
self.masked_prediction = config.masked_prediction
|
1534 |
|
1535 |
# Initialize weights and apply final processing
|
1536 |
self._init_weights(reset_params=False)
|