oweller2
committed on
Commit
•
38e83eb
1
Parent(s):
340e438
update
Browse files
- tokenizer.py +7 -8
- tokenizer_config.json +0 -1
tokenizer.py
CHANGED
@@ -7,14 +7,13 @@ class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
|
|
7 |
def _batch_encode_plus(self, *args, **kwargs):
|
8 |
outputs = super()._batch_encode_plus(*args, **kwargs)
|
9 |
del outputs["token_type_ids"]
|
10 |
-
|
11 |
-
|
12 |
-
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
# outputs[key] = [sequence[:-1] for sequence in outputs[key]]
|
18 |
return outputs
|
19 |
|
20 |
# Register the class
|
|
|
7 |
def _batch_encode_plus(self, *args, **kwargs):
|
8 |
outputs = super()._batch_encode_plus(*args, **kwargs)
|
9 |
del outputs["token_type_ids"]
|
10 |
+
for key in ['input_ids', 'attention_mask']:
|
11 |
+
if isinstance(outputs[key], torch.Tensor):
|
12 |
+
outputs[key] = outputs[key][..., :-1]
|
13 |
+
elif isinstance(outputs[key], numpy.ndarray):
|
14 |
+
outputs[key] = outputs[key][..., :-1]
|
15 |
+
elif isinstance(outputs[key], list):
|
16 |
+
outputs[key] = [sequence[:-1] for sequence in outputs[key]]
|
|
|
17 |
return outputs
|
18 |
|
19 |
# Register the class
|
tokenizer_config.json
CHANGED
@@ -938,7 +938,6 @@
|
|
938 |
"unk_token": "[UNK]",
|
939 |
"eos_token": "[SEP]",
|
940 |
"tokenizer_class": "ModernDecoderBERTTokenizer",
|
941 |
-
"truncation": "right",
|
942 |
"auto_map": {
|
943 |
"AutoConfig": "orionweller/test-flex-gpt--configuration_bert.FlexBertConfig",
|
944 |
"AutoTokenizer": [
|
|
|
938 |
"unk_token": "[UNK]",
|
939 |
"eos_token": "[SEP]",
|
940 |
"tokenizer_class": "ModernDecoderBERTTokenizer",
|
|
|
941 |
"auto_map": {
|
942 |
"AutoConfig": "orionweller/test-flex-gpt--configuration_bert.FlexBertConfig",
|
943 |
"AutoTokenizer": [
|