import argparse
import json
import datetime as dt
import os
import sys

import numpy as np
import torch
import gradio as gr
from scipy.io.wavfile import write
from pydub import AudioSegment
from torch.utils.data import DataLoader
from tqdm import tqdm
from kaldiio import WriteHelper

from model import GradTTSXvector, GradTTSWithEmo
from model.classifier import SpecClassifier
from text import text_to_sequence, convert_text, cmudict
from text.symbols import symbols
import utils_data as utils
from utils import load_checkpoint_no_logger
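# Run inference on the GPU when one is available, otherwise fall back to the CPU.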
device = 'cuda' if torch.cuda.is_available() else 'cpu'
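# Hyperparameters and decoding arguments for two-emotion mixture synthesis.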
hps, args = utils.get_hparams_decode_two_mixture()
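# Build the emotion-conditioned Grad-TTS acoustic model and the mel-spectrogram
# emotion classifier that is used for classifier guidance during decoding.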
gradtts_uncond_model = GradTTSWithEmo(**hps.model).to(device)

model = SpecClassifier(
    in_dim=hps.data.n_mel_channels,
    d_decoder=hps.model.d_decoder,
    h_decoder=hps.model.h_decoder,
    l_decoder=hps.model.l_decoder,
    k_decoder=hps.model.k_decoder,
    decoder_dropout=hps.model.decoder_dropout,
    n_class=hps.model.n_emos,
    cond_dim=hps.data.n_mel_channels,
    model_type=getattr(hps.model, "classifier_type", "CNN-with-time")
)
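# Load the pretrained TTS and classifier checkpoints.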
ckpt = './cnnwt_SGD_1959.pt'
ckpt_tts = './grad_uncond_cnn_001.pt'

utils.load_checkpoints_no_logger(ckpt_tts, gradtts_uncond_model, None)
utils.load_checkpoints_no_logger(ckpt, model, None)

_ = model.to(device).eval()
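# Set up the HiFi-GAN vocoder from its config and generator checkpoint.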
HIFIGAN_CONFIG = './config.json'
HIFIGAN_CHECKPT = './g_01720000'

from models import Generator as HiFiGAN
from env import AttrDict

print('Initializing HiFi-GAN...')
with open(HIFIGAN_CONFIG) as f:
    h = AttrDict(json.load(f))
vocoder = HiFiGAN(h)
vocoder.load_state_dict(torch.load(HIFIGAN_CHECKPT, map_location=lambda storage, loc: storage)['generator'])
_ = vocoder.to(device).eval()
vocoder.remove_weight_norm()
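# Synthesize mel-spectrogram frames for the input text with a weighted mixture of
# two emotions, then vocode them into a waveform with HiFi-GAN.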
def generate_audio(text, quantity, speaker, emotion_1, emotion_2):
    x, x_lengths = convert_text(text)
    # The dropdowns return label strings; map them to the integer class indices
    # the models expect.
    emo1 = torch.LongTensor([emotions.index(emotion_1)]).to(device)
    emo2 = torch.LongTensor([emotions.index(emotion_2)]).to(device)
    sid = torch.LongTensor([speakers.index(speaker)]).to(device)
    # The slider value (0-100) becomes the weight of Emotion 1 in the mixture.
    intensity = quantity / 100

    y_enc, y_dec, attn = gradtts_uncond_model.classifier_guidance_decode_two_mixture(
        x, x_lengths,
        n_timesteps=10,
        temperature=2.0,
        stoc=args.stoc,
        spk=sid,
        emo1=emo1,
        emo2=emo2,
        emo1_weight=intensity,
        length_scale=1.,
        classifier_func=model.forward,
        guidance=300,
        classifier_type=model.model_type
    )
    y_dec = y_dec.detach()

    # Vocode the predicted mel-spectrogram into a 16-bit PCM waveform.
    res = y_dec.squeeze().cpu().numpy()
    x = torch.from_numpy(res).to(device).unsqueeze(0)
    y_g_hat = vocoder(x)
    audio = y_g_hat.squeeze()
    audio = audio * 32768.0
    audio = audio.detach().cpu().numpy().astype('int16')
    sr = 22050
    return sr, audio
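# Example call (hypothetical inputs): generate_audio('Сәлем', 50, 'Marzhan', 'happy', 'sad')
# returns (22050, waveform) with 'happy' weighted at 0.5 against 'sad'.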
emotions = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
speakers = ['Madi', 'Marzhan', 'Akzhol']
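# Gradio front end: text input, emotion-weight slider, and dropdowns for the
# narrator and the two emotions to mix.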
demo = gr.Interface(
    generate_audio,
    [
        gr.Textbox(value='Сәлем', label="Text you want to synthesize"),
        gr.Slider(0, 100, value=0, step=10, label="Emotion 1 weight",
                  info="Weight of Emotion 1 in the mixture, between 0 and 100"),
        gr.Dropdown(speakers, value=speakers[1], label="Narrator", info="Select a narrator."),
        gr.Dropdown(emotions, value=emotions[0], label="Emotion 1", info="Select the first emotion."),
        gr.Dropdown(emotions, value=emotions[3], label="Emotion 2", info="Select the second emotion."),
    ],
    "audio"
)
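# launch() serves on localhost by default; Gradio also accepts options such as
# share=True or server_name='0.0.0.0' to expose the demo more widely.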
demo.launch()