# Spaces:
# Runtime error
# Runtime error
# Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the MIT License.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# MIT License for more details.
import argparse
import datetime as dt
import json
import os
import sys

import numpy as np
import torch
from scipy.io.wavfile import write

import params
from model import GradTTS
from text import text_to_sequence, cmudict
from text.symbols import symbols
from utils import intersperse

# The `env` and `models` modules imported below are looked up via this extra
# search path, so it has to be registered before those imports run.
sys.path.append('./hifi-gan/')
from env import AttrDict
from models import Generator as HiFiGAN
# Locations of the HiFi-GAN vocoder files, relative to the working directory:
# the JSON configuration and the checkpoint whose 'generator' entry is loaded.
HIFIGAN_CONFIG = './checkpts/hifigan-config.json'
HIFIGAN_CHECKPT = './checkpts/hifigan.pt'
def float_to_int16_pcm(waveform):
    """Convert a floating-point waveform to 16-bit signed PCM samples.

    Values are first limited to [-1, 1] and scaled by 32768, then limited to
    the int16 range before the cast.  Without the second limit a full-scale
    positive sample (1.0 * 32768 == 32768) is out of range for int16.

    Args:
        waveform: array-like of floats.

    Returns:
        A NumPy ``int16`` array with the same shape as the input.
    """
    scaled = np.clip(waveform, -1.0, 1.0) * 32768
    return np.clip(scaled, -32768, 32767).astype(np.int16)


if __name__ == '__main__':
    # Script entry point: read texts (one per line) from a file, run each one
    # through Grad-TTS and the HiFi-GAN vocoder, and write the resulting audio
    # to ./out/sample_<index>.wav.
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--file', type=str, required=True,
                        help='path to a file with texts to synthesize')
    parser.add_argument('-c', '--checkpoint', type=str, required=True,
                        help='path to a checkpoint of Grad-TTS')
    parser.add_argument('-t', '--timesteps', type=int, required=False, default=10,
                        help='number of timesteps of reverse diffusion')
    parser.add_argument('-s', '--speaker_id', type=int, required=False, default=None,
                        help='speaker id for multispeaker model')
    args = parser.parse_args()

    if args.speaker_id is not None:
        # Raised explicitly (not asserted) so the check survives `python -O`.
        if params.n_spks <= 1:
            raise ValueError("Ensure you set right number of speakers in `params.py`.")
        spk = torch.LongTensor([args.speaker_id]).cuda()
    else:
        spk = None

    # NOTE: torch.load unpickles the given files; only load checkpoints that
    # come from a trusted source.
    print('Initializing Grad-TTS...')
    generator = GradTTS(len(symbols)+1, params.n_spks, params.spk_emb_dim,
                        params.n_enc_channels, params.filter_channels,
                        params.filter_channels_dp, params.n_heads, params.n_enc_layers,
                        params.enc_kernel, params.enc_dropout, params.window_size,
                        params.n_feats, params.dec_dim, params.beta_min, params.beta_max,
                        params.pe_scale)
    # Deserialize on the CPU first; the model is moved to the GPU afterwards.
    generator.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
    generator.cuda().eval()
    print(f'Number of parameters: {generator.nparams}')

    print('Initializing HiFi-GAN...')
    with open(HIFIGAN_CONFIG) as f:
        h = AttrDict(json.load(f))
    vocoder = HiFiGAN(h)
    vocoder.load_state_dict(torch.load(HIFIGAN_CHECKPT, map_location='cpu')['generator'])
    vocoder.cuda().eval()
    vocoder.remove_weight_norm()

    with open(args.file, 'r', encoding='utf-8') as f:
        texts = [line.strip() for line in f]
    cmu = cmudict.CMUDict('./resources/cmu_dictionary')

    # Sample rate used both for the RTF figure and for the written WAV files.
    sample_rate = 22050
    # Audio samples accounted for per element of y_dec's last axis in the RTF
    # formula (presumably the mel hop length -- confirm against params).
    samples_per_frame = 256

    # Make sure the output folder exists before the first file is written.
    os.makedirs('./out', exist_ok=True)

    with torch.no_grad():
        for i, text in enumerate(texts):
            print(f'Synthesizing {i} text...', end=' ')
            sequence = intersperse(text_to_sequence(text, dictionary=cmu), len(symbols))
            # [None] adds a leading axis of size 1 in front of the sequence.
            x = torch.LongTensor(sequence).cuda()[None]
            x_lengths = torch.LongTensor([x.shape[-1]]).cuda()

            started = dt.datetime.now()
            _, y_dec, _ = generator.forward(x, x_lengths, n_timesteps=args.timesteps,
                                            temperature=1.5, stoc=False, spk=spk,
                                            length_scale=0.91)
            elapsed = (dt.datetime.now() - started).total_seconds()
            # Ratio of generation time to the duration of the generated audio.
            print(f'Grad-TTS RTF: {elapsed * sample_rate / (y_dec.shape[-1] * samples_per_frame)}')

            waveform = vocoder.forward(y_dec).cpu().squeeze().clamp(-1, 1).numpy()
            audio = float_to_int16_pcm(waveform)
            write(f'./out/sample_{i}.wav', sample_rate, audio)

    print('Done. Check out `out` folder for samples.')