Spaces:
Running
on
Zero
Running
on
Zero
Normalize Polish text to NFC before tokenization
Browse files
src/chatterbox/models/tokenizers/tokenizer.py
CHANGED
@@ -306,6 +306,10 @@ class MTLTokenizer:
|
|
306 |
txt = korean_normalize(txt)
|
307 |
elif language_id == 'ru':
|
308 |
txt = self.russian_stress_labeler(txt)
|
|
|
|
|
|
|
|
|
309 |
|
310 |
# Prepend language token
|
311 |
if language_id:
|
|
|
306 |
txt = korean_normalize(txt)
|
307 |
elif language_id == 'ru':
|
308 |
txt = self.russian_stress_labeler(txt)
|
309 |
+
elif language_id == 'pl':
|
310 |
+
# Polish text normalization: compose base letters and combining marks into precomposed diacritic characters (Unicode NFC)
|
311 |
+
import unicodedata
|
312 |
+
txt = unicodedata.normalize('NFC', txt)
|
313 |
|
314 |
# Prepend language token
|
315 |
if language_id:
|