Zihan428 committed on
Commit
364e836
·
1 Parent(s): 4a71cd1

Normalize Polish text to NFC before tokenization

Browse files
src/chatterbox/models/tokenizers/tokenizer.py CHANGED
@@ -306,6 +306,10 @@ class MTLTokenizer:
306
  txt = korean_normalize(txt)
307
  elif language_id == 'ru':
308
  txt = self.russian_stress_labeler(txt)
 
 
 
 
309
 
310
  # Prepend language token
311
  if language_id:
 
306
  txt = korean_normalize(txt)
307
  elif language_id == 'ru':
308
  txt = self.russian_stress_labeler(txt)
309
+ elif language_id == 'pl':
310
+ # Polish text normalization: ensure diacritic characters are preserved
311
+ import unicodedata
312
+ txt = unicodedata.normalize('NFC', txt)
313
 
314
  # Prepend language token
315
  if language_id: