AlumiK committed
Commit 30eca34 · 1 Parent(s): aed054e

update tokenizer

Files changed (2)
  1. tokenization_linglong_fast.py +2 -1
  2. tokenizer.json +7 -3
tokenization_linglong_fast.py CHANGED
@@ -74,8 +74,9 @@ class LingLongTokenizerFast(PreTrainedTokenizerFast):
         )
         backend_tokenizer.normalizer = normalizers.Sequence(normalizer_sequence)
         backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
-            pre_tokenizers.WhitespaceSplit(),
             pre_tokenizers.Digits(individual_digits=True),
+            pre_tokenizers.Punctuation(),
+            pre_tokenizers.WhitespaceSplit(),
         ])
         super().__init__(
             tokenizer_file=tokenizer_file,
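
A minimal sketch of what the reordered pre-tokenizer sequence does, using the tokenizers library directly; the sample string and the expected splits are illustrative, not taken from this repo:

from tokenizers import pre_tokenizers

# Mirror the new pre-tokenizer order from this commit: digits first,
# then punctuation (isolated by default), then whitespace splitting.
pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Digits(individual_digits=True),
    pre_tokenizers.Punctuation(),
    pre_tokenizers.WhitespaceSplit(),
])

# pre_tokenize_str returns (piece, (start, end)) pairs; with this order,
# "abc 123, def!" splits into abc / 1 / 2 / 3 / , / def / !
for piece, span in pre_tokenizer.pre_tokenize_str('abc 123, def!'):
    print(piece, span)

Note that moving WhitespaceSplit to the end does not change which pieces survive here, since Digits and Punctuation preserve the whitespace that WhitespaceSplit later discards; the new order simply makes punctuation its own split step.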
tokenizer.json CHANGED
@@ -179,12 +179,16 @@
     "pre_tokenizer": {
       "type": "Sequence",
       "pretokenizers": [
-        {
-          "type": "WhitespaceSplit"
-        },
         {
           "type": "Digits",
           "individual_digits": true
+        },
+        {
+          "type": "Punctuation",
+          "behavior": "Isolated"
+        },
+        {
+          "type": "WhitespaceSplit"
         }
       ]
     },
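
The JSON mirrors the Python change one-to-one ("Isolated" is the default Punctuation behavior). A quick check, assuming a local copy of the tokenizer.json from this commit:

from tokenizers import Tokenizer

# Load the serialized tokenizer (path assumed; adjust to your checkout)
# and confirm the Sequence pre-tokenizer behaves like the Python version.
tokenizer = Tokenizer.from_file('tokenizer.json')
print(tokenizer.pre_tokenizer.pre_tokenize_str('abc 123, def!'))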