"""Synthesize mel-spectrograms for the utterances listed in a model's filelists.

Loads the latest checkpoint found in ``--model_dir``, runs the generator on
every ``<audio path>|<text>`` line of the filelists and stores each generated
mel as ``<audio name>.npy`` inside ``--mels_dir``.
"""

import os
from argparse import ArgumentParser

import numpy as np
import torch
from tqdm import tqdm

import commons
import models
import utils
from text import text_to_sequence

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument("-m", "--model_dir", required=True, type=str)
    parser.add_argument("-s", "--mels_dir", required=True, type=str)
    args = parser.parse_args()

    MODEL_DIR = args.model_dir  # path to model dir
    SAVE_MELS_DIR = args.mels_dir  # path to save generated mels

    # exist_ok=True replaces the exists()/makedirs() pair, which could race
    # with another process creating the same directory.
    os.makedirs(SAVE_MELS_DIR, exist_ok=True)

    hps = utils.get_hparams_from_dir(MODEL_DIR)

    # NOTE(review): the hparams attribute names used below (`punc`, `chars`,
    # `text_cleaners`, `n_mel_channels`, `training_files`, `validation_files`)
    # were reconstructed and are assumptions -- confirm them against the
    # project's config schema before merging.
    symbols = list(hps.data.punc) + list(hps.data.chars)
    checkpoint_path = utils.latest_checkpoint_path(MODEL_DIR)
    cleaner = hps.data.text_cleaners

    # Looked up once: the same flag sizes the model's vocabulary and decides
    # how each utterance is encoded in get_mel().
    add_blank = getattr(hps.data, "add_blank", False)

    # Inference settings passed to the generator for every utterance.
    NOISE_SCALE = 0.667
    LENGTH_SCALE = 1.0

    model = models.FlowGenerator(
        len(symbols) + add_blank,
        out_channels=hps.data.n_mel_channels,
        **hps.model
    ).to("cuda")

    utils.load_checkpoint(checkpoint_path, model)
    model.decoder.store_inverse()  # do not calculate jacobians for fast decoding
    _ = model.eval()

    def get_mel(text, fpath):
        """Generate a mel for `text` and save it as `fpath` under SAVE_MELS_DIR.

        Args:
            text: Transcript of one utterance.
            fpath: File name (relative to SAVE_MELS_DIR) for the saved array.
        """
        if add_blank:
            text_norm = text_to_sequence(text, symbols, cleaner)
            # The blank token uses index len(symbols), one past the last symbol.
            text_norm = commons.intersperse(text_norm, len(symbols))
        else:
            # If not using "add_blank" option during training, adding spaces at
            # the beginning and the end of utterance improves quality
            text = " " + text.strip() + " "
            text_norm = text_to_sequence(text, symbols, cleaner)

        # Add a leading batch axis of size 1.
        sequence = np.array(text_norm)[None, :]
        x_tst = torch.from_numpy(sequence).cuda().long()
        x_tst_lengths = torch.tensor([x_tst.shape[1]]).cuda()

        with torch.no_grad():
            # Only the first item of the first returned group is needed.
            (y_gen_tst, *_), *_ = model(
                x_tst,
                x_tst_lengths,
                gen=True,
                noise_scale=NOISE_SCALE,
                length_scale=LENGTH_SCALE,
            )

        np.save(
            os.path.join(SAVE_MELS_DIR, fpath),
            y_gen_tst.cpu().detach().numpy(),
        )

    for f in [hps.data.training_files, hps.data.validation_files]:
        # Context manager closes the filelist promptly.
        # NOTE(review): encoding is pinned to UTF-8 so non-ASCII transcripts do
        # not depend on the platform locale -- confirm the filelists are UTF-8.
        with open(f, encoding="utf-8") as filelist:
            file_lines = filelist.read().splitlines()

        for line in tqdm(file_lines):
            if not line.strip():
                # A blank (e.g. trailing) line has no "|" to split on.
                continue
            fname, text = line.split("|")
            fname = os.path.basename(fname).replace(".wav", ".npy")
            get_mel(text, fname)