import torch
from phonemizer import phonemize
from phonemizer.separator import Separator

|
class TextEncoder:
    """Convert raw text into phoneme token-ID tensors.

    Supports 15 languages via an eSpeak-backed phonemizer, with a
    character-level fallback when the eSpeak backend is unavailable.
    """

    def __init__(self, vocab_map=None):
        """Initialize the encoder.

        Args:
            vocab_map: optional mapping from symbol (phoneme/character) to
                integer token ID. Defaults to a character-level vocabulary
                over space, lowercase ASCII letters, and the word
                separator '|'.
        """
        # Phonemes separated by spaces, words by '|', no syllable marks.
        self.separator = Separator(phone=' ', word='|', syllable='')

        # ISO 639-1 code -> eSpeak voice identifier (15 languages).
        self.lang_map = {
            'en': 'en-us', 'zh': 'cmn', 'es': 'es', 'fr': 'fr-fr',
            'de': 'de', 'ja': 'ja', 'ko': 'ko', 'ru': 'ru',
            'pt': 'pt', 'it': 'it', 'hi': 'hi', 'ar': 'ar',
            'tr': 'tr', 'nl': 'nl', 'bn': 'bn'
        }

        # BUG FIX: explicit `is not None` check so a deliberately empty
        # vocab_map ({}) is honored instead of being silently replaced
        # by the default character vocabulary (the old truthiness test
        # discarded any falsy mapping).
        if vocab_map is not None:
            self.vocab = vocab_map
        else:
            self.vocab = {c: i for i, c in enumerate(" abcdefghijklmnopqrstuvwxyz|")}

    def preprocess(self, text, lang_code='en'):
        """Convert ``text`` to a (1, seq_len) LongTensor of token IDs.

        Args:
            text: input string to encode.
            lang_code: ISO 639-1 language code. Unknown codes fall back to
                the English ('en-us') eSpeak voice with a printed warning.

        Returns:
            torch.Tensor of dtype long and shape (1, seq_len); symbols
            missing from the vocabulary map to ID 0.
        """
        if lang_code not in self.lang_map:
            print(f"Warning: Language {lang_code} not fully supported, defaulting to English backend.")
            backend_lang = 'en-us'
        else:
            backend_lang = self.lang_map[lang_code]

        try:
            phonemes = phonemize(
                text,
                language=backend_lang,
                backend='espeak',
                separator=self.separator,
                strip=True,
                preserve_punctuation=True,
                njobs=1
            )
        except RuntimeError:
            # phonemizer raises RuntimeError when the eSpeak backend is
            # missing; degrade gracefully to per-character tokens.
            print("Warning: eSpeak not found. Falling back to character-level tokenization.")
            phonemes = list(text)

        # NOTE(review): on success `phonemes` is a string, so this iterates
        # it character-by-character (including ' ' and '|' separators).
        # That matches the default character vocabulary, but a vocab_map
        # keyed by multi-character phonemes would require splitting on the
        # separator instead — confirm the intended granularity.
        token_ids = [self.vocab.get(p, 0) for p in phonemes]
        # BUG FIX: pin dtype=torch.long so empty input also yields an
        # int64 tensor (torch.tensor([]) would default to float32, while
        # non-empty int lists already infer int64).
        return torch.tensor(token_ids, dtype=torch.long).unsqueeze(0)