"""BPE text tokenizer wrapper used to turn text into token ids and back."""

import os
from typing import List, Optional

import torch
from tokenizers import Tokenizer

from TTS.tts.utils.text.cleaners import english_cleaners

# Vocabulary file used when the caller does not supply one; resolved relative
# to this module's real (symlink-resolved) location.
DEFAULT_VOCAB_FILE = os.path.join(
    os.path.dirname(os.path.realpath(__file__)), "../../utils/assets/tortoise/tokenizer.json"
)


class VoiceBpeTokenizer:
    """Thin wrapper around a ``tokenizers.Tokenizer``.

    Spaces are represented by the literal ``[SPACE]`` marker: ``encode``
    substitutes it for every space before tokenizing and ``decode`` turns it
    back into a space afterwards.
    """

    def __init__(self, vocab_file: Optional[str] = DEFAULT_VOCAB_FILE, vocab_str: Optional[str] = None):
        """Build the underlying tokenizer.

        Args:
            vocab_file: Path passed to ``Tokenizer.from_file``. Ignored when
                ``vocab_str`` is given.
            vocab_str: Serialized tokenizer passed to ``Tokenizer.from_str``.
                Takes precedence over ``vocab_file``.

        If both arguments are ``None`` the instance is created with
        ``self.tokenizer`` set to ``None``.
        """
        self.tokenizer = None
        # ``vocab_str`` always wins. Checking it first avoids reading (and
        # possibly failing on) ``vocab_file`` -- which defaults to the bundled
        # asset -- only to throw the result away.
        if vocab_str is not None:
            self.tokenizer = Tokenizer.from_str(vocab_str)
        elif vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)

    def preprocess_text(self, txt: str) -> str:
        """Return ``txt`` after running it through ``english_cleaners``."""
        return english_cleaners(txt)

    def encode(self, txt: str) -> List[int]:
        """Clean ``txt`` and return its token ids.

        Every space in the cleaned text is replaced by the ``[SPACE]`` marker
        before the text is handed to the tokenizer.
        """
        txt = self.preprocess_text(txt)
        txt = txt.replace(" ", "[SPACE]")
        return self.tokenizer.encode(txt).ids

    def decode(self, seq) -> str:
        """Turn a sequence of token ids back into text.

        Args:
            seq: Token ids. A ``torch.Tensor`` is moved to the CPU and
                converted to a NumPy array first; anything else is passed to
                the tokenizer unchanged.

        Returns:
            The decoded string with ``[SPACE]`` markers turned into spaces and
            ``[STOP]`` / ``[UNK]`` markers removed.
        """
        if isinstance(seq, torch.Tensor):
            seq = seq.cpu().numpy()
        # Special tokens are kept so the markers below survive decoding. All
        # literal spaces in the decoded string are dropped (presumably they
        # are only separators inserted between tokens -- TODO confirm); real
        # spaces are restored from the [SPACE] marker on the next line.
        txt = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
        txt = txt.replace("[SPACE]", " ")
        txt = txt.replace("[STOP]", "")
        txt = txt.replace("[UNK]", "")
        return txt