update tokenizer
- tokenization_linglong_fast.py +2 -1
- tokenizer.json +7 -3
tokenization_linglong_fast.py CHANGED
@@ -74,8 +74,9 @@ class LingLongTokenizerFast(PreTrainedTokenizerFast):
         )
         backend_tokenizer.normalizer = normalizers.Sequence(normalizer_sequence)
         backend_tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
-            pre_tokenizers.WhitespaceSplit(),
             pre_tokenizers.Digits(individual_digits=True),
+            pre_tokenizers.Punctuation(),
+            pre_tokenizers.WhitespaceSplit(),
         ])
         super().__init__(
             tokenizer_file=tokenizer_file,
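For context, the change reorders the pre-tokenizer chain so that digit splitting runs first, then punctuation isolation, then whitespace splitting. Below is a minimal sketch (not part of the commit) of how the new chain behaves when built directly with the Hugging Face tokenizers library; the sample string is illustrative only:

from tokenizers import pre_tokenizers

# Mirror of the new pre_tokenizer Sequence from the diff above.
pre_tok = pre_tokenizers.Sequence([
    pre_tokenizers.Digits(individual_digits=True),  # split each digit into its own piece
    pre_tokenizers.Punctuation(),                   # isolate punctuation marks
    pre_tokenizers.WhitespaceSplit(),               # finally split on whitespace
])

# pre_tokenize_str returns (piece, (start, end)) tuples.
for piece, span in pre_tok.pre_tokenize_str("abc 123, def!"):
    print(piece, span)
# Expected pieces, roughly: abc, 1, 2, 3, ",", def, "!"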
tokenizer.json CHANGED
@@ -179,12 +179,16 @@
   "pre_tokenizer": {
     "type": "Sequence",
     "pretokenizers": [
-      {
-        "type": "WhitespaceSplit"
-      },
       {
         "type": "Digits",
         "individual_digits": true
+      },
+      {
+        "type": "Punctuation",
+        "behavior": "Isolated"
+      },
+      {
+        "type": "WhitespaceSplit"
       }
     ]
   },
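The tokenizer.json edit keeps the serialized backend in sync with the pre-tokenizer built in tokenization_linglong_fast.py. A quick way to sanity-check the updated file is sketched below (not part of the commit; the path is assumed to point at this repo's tokenizer.json):

from tokenizers import Tokenizer

# Load the serialized backend tokenizer shipped in the repo.
tok = Tokenizer.from_file("tokenizer.json")

# The loaded pre_tokenizer should now be Digits -> Punctuation -> WhitespaceSplit,
# matching the Sequence constructed in the Python class above.
print(tok.pre_tokenizer.pre_tokenize_str("LingLong 2.0, 你好!"))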