|
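"""Inference script: synthesizes emotional speech with GradTTSWithEmo and HiFi-GAN.

Reads "text|emotion_id|speaker_id" lines from the input file, expands digits into
Kazakh words, and writes one WAV file per line to args.generated_path.
"""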
import json
import re

import numpy as np
import torch
from attrdict import AttrDict
from num2words import num2words
from pydub import AudioSegment

import utils_data as utils
from model import GradTTSWithEmo
from models import Generator as HiFiGAN
from text import convert_text
|
HIFIGAN_CONFIG = './configs/hifigan-config.json'
HIFIGAN_CHECKPT = './checkpts/hifigan.pt'
|
|
if __name__ == '__main__':
    hps, args = utils.get_hparams_decode()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    logger = utils.get_logger(hps.model_dir, "inference.log")

    # Restore the latest EMA checkpoint of the acoustic model.
    ckpt = utils.latest_checkpoint_path(hps.model_dir, "EMA_grad_*.pt")
    logger.info(f'Loading checkpoint {ckpt}')
    model = GradTTSWithEmo(**hps.model).to(device)
    utils.load_checkpoint(ckpt, model, None)
    _ = model.eval()

    print('Initializing HiFi-GAN...')
    with open(HIFIGAN_CONFIG) as f:
        h = AttrDict(json.load(f))
    vocoder = HiFiGAN(h).to(device)
    vocoder.load_state_dict(torch.load(HIFIGAN_CHECKPT, map_location=device)['generator'])
    _ = vocoder.eval()
    vocoder.remove_weight_norm()
|
    # Emotion and speaker IDs in the input file index into these lists.
    emos = sorted(["angry", "surprise", "fear", "happy", "neutral", "sad"])
    speakers = ['M1', 'F1', 'M2']
|
    # Each input line is expected to look like: text|emotion_id|speaker_id
    with open(args.file, 'r', encoding='utf-8') as f:
        texts = [line.strip() for line in f if line.strip()]

    # Split off the text field so that only it goes through number expansion.
    replace_nums = [line.split('|', 1) for line in texts]

    # Spell out digit sequences as Kazakh words before synthesis.
    nums2word = [re.sub(r'\d+', lambda m: num2words(int(m.group()), lang='kz'), text)
                 for text, _ in replace_nums]

    # Re-attach the emotion/speaker fields to the normalized text.
    text2speech = [f'{text}|{rest}' for text, (_, rest) in zip(nums2word, replace_nums)]
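    # For illustration, one processed line could look like
    # "Бүгін ауа райы жиырма бес градус|3|1": emotion index 3 ("neutral" in the
    # sorted emos list) rendered with speaker 'F1'.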
|
|
|
    for i, line in enumerate(text2speech):
        text, emo_field, spk_field = line.split('|')
        control_emo_id = int(emo_field)   # index into emos
        control_spk_id = int(spk_field)   # index into speakers

        with torch.no_grad():
            emo = torch.LongTensor([control_emo_id]).to(device)
            sid = torch.LongTensor([control_spk_id]).to(device)
            text_padded, text_len = convert_text(text)

            # Decode a mel-spectrogram with the diffusion decoder.
            y_enc, y_dec, attn = model.forward(text_padded, text_len,
                                               n_timesteps=args.timesteps,
                                               temperature=args.noise,
                                               stoc=args.stoc, spk=sid, emo=emo,
                                               length_scale=1.,
                                               classifier_free_guidance=args.guidance)

            # Vocode the mel-spectrogram with HiFi-GAN and convert to 16-bit PCM.
            y_g_hat = vocoder(y_dec)
            audio = y_g_hat.squeeze() * 32768.0
            audio = audio.detach().cpu().numpy().astype('int16')

        audio = AudioSegment(audio.tobytes(), frame_rate=22050, sample_width=2, channels=1)
        # Prefix the file name with the line index so that repeated
        # emotion/speaker combinations do not overwrite each other.
        audio.export(f'{args.generated_path}/{i}_{emos[control_emo_id]}_{speakers[control_spk_id]}.wav',
                     format="wav")