# Spaces: Sleeping  (status banner text captured along with the file; not program code)
import gradio as gr
import numpy as np
import torch

from melgan.model.generator import Generator
from melgan.utils.hparams import load_hparam
from tacotron2_gst_original.hyper_parameters import tacotron_params as hparams
from tacotron2_gst_original.text import text_to_sequence
from tacotron2_gst_original.training import load_model
# NOTE(review): presumably the full-scale magnitude of 16-bit PCM (2**15);
# nothing in this file references it — confirm before removing.
MAX_WAV_VALUE = 32768.0

# Seed PyTorch's default RNG so repeated runs start from the same state.
torch.manual_seed(1234)
def init_models(hparams):
    """Load the Tacotron2+GST synthesizer and the MelGAN vocoder.

    Both networks are restored from checkpoints under ``trained_models/``,
    moved to the GPU when CUDA is available (CPU otherwise) and put in
    evaluation mode. They are published as the module-level names ``model``
    and ``vocoder_model``, which is where ``synthesize`` looks them up.

    Args:
        hparams: Hyper-parameter object forwarded unchanged to ``load_model``.

    Returns:
        The ``(model, vocoder_model)`` pair, for callers that prefer not to
        depend on the module-level names.
    """
    # synthesize() reads these as module globals; binding them as locals
    # would leave them undefined at request time.
    global model, vocoder_model

    # Prefer the GPU, but do not hard-fail when CUDA is unavailable.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    # load trained tacotron2 + GST model:
    model = load_model(hparams)
    checkpoint_path = "trained_models/checkpoint_78000.model"
    # map_location remaps tensors that were saved from another device.
    tacotron_checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(tacotron_checkpoint['state_dict'])
    model.to(device)
    model.eval()

    # load pre trained MelGAN model for mel2audio:
    vocoder_checkpoint_path = "trained_models/nvidia_tacotron2_LJ11_epoch6400.pt"
    checkpoint = torch.load(vocoder_checkpoint_path, map_location=device)
    # NOTE(review): this config is loaded but never read below, while the
    # generator is built with a literal 80 — confirm whether that value
    # should come from hp_melgan instead.
    hp_melgan = load_hparam("melgan/config/default.yaml")
    vocoder_model = Generator(80)
    vocoder_model.load_state_dict(checkpoint['model_g'])
    vocoder_model = vocoder_model.to(device)
    vocoder_model.eval(inference=False)

    return model, vocoder_model
def synthesize(text):
    """Turn *text* into speech and return it as a ``(rate, samples)`` pair.

    Reads the module-level ``model`` and ``vocoder_model``; both must have
    been assigned before this function is called.

    Args:
        text: Input string. It is converted to a symbol-id sequence by
            ``text_to_sequence`` using the ``english_cleaners`` cleaner.

    Returns:
        The tuple ``(22050, audio_numpy)``, where ``audio_numpy`` is the
        vocoder output copied to host memory as a NumPy array.
    """
    # Follow whatever device the synthesizer's weights are on rather than
    # assuming CUDA.
    device = next(model.parameters()).device

    # [None, :] adds a leading axis of length 1 in front of the id sequence.
    sequence = np.array(text_to_sequence(text, ['english_cleaners']))[None, :]
    sequence = torch.from_numpy(sequence).to(device=device, dtype=torch.int64)

    # Fixed scores handed to model.inference as its second argument.
    gst_head_scores = np.array([0.5, 0.15, 0.35])
    gst_scores = torch.from_numpy(gst_head_scores).to(device).float()

    # Neither network needs gradients here, so run both stages without
    # autograd bookkeeping.
    with torch.no_grad():
        _, mel_outputs_postnet, _, _ = model.inference(sequence, gst_scores)
        # mel2wav inference:
        audio = vocoder_model.inference(mel_outputs_postnet)

    audio_numpy = audio.detach().cpu().numpy()
    return (22050, audio_numpy)
# Load both networks before the UI is built and launched.
init_models(hparams)

# One free-text input, one audio output fed by synthesize().
speech_output = gr.Audio(label="Generated Speech", type="numpy")
iface = gr.Interface(
    fn=synthesize,
    inputs="text",
    outputs=[speech_output],
)
iface.launch()