oweller2 committed on
Commit
38e83eb
1 Parent(s): 340e438
Files changed (2) hide show
  1. tokenizer.py +7 -8
  2. tokenizer_config.json +0 -1
tokenizer.py CHANGED
@@ -7,14 +7,13 @@ class ModernDecoderBERTTokenizer(PreTrainedTokenizerFast):
7
  def _batch_encode_plus(self, *args, **kwargs):
8
  outputs = super()._batch_encode_plus(*args, **kwargs)
9
  del outputs["token_type_ids"]
10
- # if the last token is eos, remove it
11
- # for key in ['input_ids', 'attention_mask']:
12
- # if isinstance(outputs[key], torch.Tensor):
13
- # outputs[key] = outputs[key][..., :-1]
14
- # elif isinstance(outputs[key], numpy.ndarray):
15
- # outputs[key] = outputs[key][..., :-1]
16
- # elif isinstance(outputs[key], list):
17
- # outputs[key] = [sequence[:-1] for sequence in outputs[key]]
18
  return outputs
19
 
20
  # Register the class
 
7
  def _batch_encode_plus(self, *args, **kwargs):
8
  outputs = super()._batch_encode_plus(*args, **kwargs)
9
  del outputs["token_type_ids"]
10
+ for key in ['input_ids', 'attention_mask']:
11
+ if isinstance(outputs[key], torch.Tensor):
12
+ outputs[key] = outputs[key][..., :-1]
13
+ elif isinstance(outputs[key], numpy.ndarray):
14
+ outputs[key] = outputs[key][..., :-1]
15
+ elif isinstance(outputs[key], list):
16
+ outputs[key] = [sequence[:-1] for sequence in outputs[key]]
 
17
  return outputs
18
 
19
  # Register the class
tokenizer_config.json CHANGED
@@ -938,7 +938,6 @@
938
  "unk_token": "[UNK]",
939
  "eos_token": "[SEP]",
940
  "tokenizer_class": "ModernDecoderBERTTokenizer",
941
- "truncation": "right",
942
  "auto_map": {
943
  "AutoConfig": "orionweller/test-flex-gpt--configuration_bert.FlexBertConfig",
944
  "AutoTokenizer": [
 
938
  "unk_token": "[UNK]",
939
  "eos_token": "[SEP]",
940
  "tokenizer_class": "ModernDecoderBERTTokenizer",
 
941
  "auto_map": {
942
  "AutoConfig": "orionweller/test-flex-gpt--configuration_bert.FlexBertConfig",
943
  "AutoTokenizer": [