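# Hugging Face file-viewer residue (page chrome, not part of the module):
#   avatar alt text: "Shadhil's picture"
#   commit 9b2107c: "voice-clone with single audio sample input"
#   viewer links: raw / history / blame
#   scan status: No virus; file size: 1.14 kB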
import os
import torch
from tokenizers import Tokenizer
from TTS.tts.utils.text.cleaners import english_cleaners
# Directory containing this module, with symlinks resolved.
_HERE = os.path.dirname(os.path.realpath(__file__))

# Default tokenizer definition, addressed relative to this module's directory.
# The path is intentionally left un-normalized (the "../.." segments are kept).
DEFAULT_VOCAB_FILE = os.path.join(_HERE, "../../utils/assets/tortoise/tokenizer.json")
class VoiceBpeTokenizer:
    """Thin wrapper around a ``tokenizers.Tokenizer`` for text <-> token ids.

    Before encoding, text is passed through ``english_cleaners`` and every
    space is replaced by the literal ``[SPACE]`` marker. Decoding reverses the
    marker substitution and drops the ``[STOP]`` and ``[UNK]`` markers.
    """

    def __init__(self, vocab_file=DEFAULT_VOCAB_FILE, vocab_str=None):
        """Build the underlying tokenizer.

        Args:
            vocab_file: Path handed to ``Tokenizer.from_file``. Only used when
                ``vocab_str`` is None.
            vocab_str: String handed to ``Tokenizer.from_str``. Takes
                precedence over ``vocab_file`` when both are given.

        If both arguments are None, ``self.tokenizer`` stays None and
        ``encode``/``decode`` will fail until a tokenizer is assigned.
        """
        self.tokenizer = None
        if vocab_str is not None:
            # vocab_str always wins, so do not read vocab_file first: loading
            # it would be wasted I/O and would raise if the (default) file is
            # missing even though a usable definition was supplied.
            self.tokenizer = Tokenizer.from_str(vocab_str)
        elif vocab_file is not None:
            self.tokenizer = Tokenizer.from_file(vocab_file)

    def preprocess_text(self, txt):
        """Return ``txt`` after applying ``english_cleaners``."""
        return english_cleaners(txt)

    def encode(self, txt):
        """Clean ``txt``, mark spaces as ``[SPACE]`` and return the token ids."""
        cleaned = self.preprocess_text(txt)
        marked = cleaned.replace(" ", "[SPACE]")
        return self.tokenizer.encode(marked).ids

    def decode(self, seq):
        """Turn a sequence of token ids back into text.

        Args:
            seq: Token ids; a ``torch.Tensor`` is converted to a plain Python
                list first, anything else is passed to the tokenizer as-is.

        Returns:
            The decoded string with ``[SPACE]`` mapped back to a space and the
            ``[STOP]`` / ``[UNK]`` markers removed.
        """
        if isinstance(seq, torch.Tensor):
            # Tokenizer.decode is documented to take a list of ints, so hand it
            # native Python ints rather than a NumPy array.
            seq = seq.cpu().tolist()
        # Special tokens are kept so the [SPACE] markers survive decoding; the
        # spaces present in the decoder output are removed before the markers
        # are turned back into real spaces.
        text = self.tokenizer.decode(seq, skip_special_tokens=False).replace(" ", "")
        for marker, replacement in (("[SPACE]", " "), ("[STOP]", ""), ("[UNK]", "")):
            text = text.replace(marker, replacement)
        return text