# CantusSVS-hf / scripts /vocode.py
# liampond
# Clean deploy snapshot
# c42fe7e
# coding=utf8
import argparse
import os
import pathlib
import sys
# Repository root: two directory levels above this script file, resolved to an
# absolute path.
root_dir = pathlib.Path(__file__).parent.parent.resolve()
# Publish the root through the PYTHONPATH environment variable
# (presumably for any child processes -- confirm).
os.environ['PYTHONPATH'] = str(root_dir)
# Put the root at the front of sys.path; this must run before the project
# imports below (inference.*, utils.*), which presumably resolve from the root.
sys.path.insert(0, str(root_dir))
import numpy as np
import torch
import tqdm
from inference.ds_acoustic import DiffSingerAcousticInfer
from utils.infer_utils import cross_fade, save_wav
from utils.hparams import set_hparams, hparams
# Command-line interface: one positional mel file plus optional string flags.
parser = argparse.ArgumentParser(description='Run DiffSinger vocoder')
parser.add_argument('mel', type=str, help='Path to the input file')
# All optional flags share the same shape (non-required string), so they are
# declared as a (flag, help text) table and registered in order.
for _flag, _help in (
    ('--exp', 'Read vocoder class and path from chosen experiment'),
    ('--config', 'Read vocoder class and path from config file'),
    ('--class', 'Specify vocoder class'),
    ('--ckpt', 'Specify vocoder checkpoint path'),
    ('--out', 'Path of the output folder'),
    ('--title', 'Title of output file'),
):
    parser.add_argument(_flag, type=str, required=False, help=_help)
# Do not leave the loop variables behind as module globals.
del _flag, _help
# Parse the command line; ``mel`` is the path of the input file to vocode.
args = parser.parse_args()
mel = pathlib.Path(args.mel)
# Output base name: an explicit --title wins, otherwise the input file's stem.
name = args.title if args.title else mel.stem

# Locate the config file. --exp takes precedence over --config when both are
# given.
if args.exp:
    config = root_dir / 'checkpoints' / args.exp / 'config.yaml'
elif args.config:
    config = pathlib.Path(args.config)
else:
    # Report a usage error (exit status 2) instead of ``assert False``:
    # assertions are stripped under ``python -O``, which would leave ``config``
    # as None and pass the literal string 'None' on as the config path.
    parser.error('Either argument \'--exp\' or \'--config\' should be specified.')

# Replace this script's own arguments with just ``--config <path>`` before
# calling set_hparams (presumably it reads its options from sys.argv --
# confirm against utils.hparams).
sys.argv = [sys.argv[0], '--config', str(config)]
set_hparams(print_hparams=False)
# Optional command-line overrides for the vocoder entries in hparams.
# 'class' is a Python keyword, so ``args.class`` is not valid syntax and the
# attribute has to be fetched with getattr.
cls = getattr(args, 'class')
if cls:
    hparams['vocoder'] = cls
if args.ckpt:
    hparams['vocoder_ckpt'] = args.ckpt

# Output folder: the requested --out path, or the input file's own folder.
out = pathlib.Path(args.out) if args.out else mel.parent
# Load the serialized mel sequence from the input file.
# SECURITY NOTE: torch.load unpickles the file, so only pass mel files that
# come from a trusted source (consider weights_only=True where the installed
# torch version supports it).
# map_location='cpu' lets files that were saved from GPU tensors load on
# CPU-only machines; run_vocoder moves each segment to the inference device.
mel_seq = torch.load(mel, map_location='cpu')
# Validate with explicit exceptions: ``assert`` statements are removed under
# ``python -O`` and would let a malformed file through.
if not isinstance(mel_seq, list):
    raise TypeError('Not a valid mel sequence.')
if not mel_seq:
    raise ValueError('Mel sequence is empty.')

sample_rate = hparams['audio_sample_rate']
# NOTE(review): load_model=False presumably skips loading the acoustic model,
# since only the vocoder is used here -- confirm against DiffSingerAcousticInfer.
infer_ins = DiffSingerAcousticInfer(load_model=False)
def run_vocoder(path: pathlib.Path):
    """Vocode every segment of ``mel_seq`` and save the stitched audio to ``path``.

    Each segment is placed at sample index ``round(offset * sample_rate)``.
    If that index lies at or beyond the samples produced so far, the gap is
    filled with zeros and the segment is appended; otherwise the segment is
    passed to ``cross_fade`` together with its start index.
    """
    waveform = np.zeros(0)
    end_pos = 0  # sample index just past the last segment placed so far
    for segment in tqdm.tqdm(mel_seq, desc='mel segment', total=len(mel_seq)):
        device = infer_ins.device
        audio = infer_ins.run_vocoder(segment['mel'].to(device), f0=segment['f0'].to(device))
        audio = audio.squeeze(0).cpu().numpy()
        # Positive gap: silence before this segment; negative: overlap.
        gap = round(segment['offset'] * sample_rate) - end_pos
        if gap < 0:
            waveform = cross_fade(waveform, audio, end_pos + gap)
        else:
            waveform = np.append(np.append(waveform, np.zeros(gap)), audio)
        end_pos += gap + audio.shape[0]
    print(f'| save audio: {path}')
    save_wav(waveform, path, sample_rate)
# Make sure the output folder exists, then vocode into '<out>/<name>.wav'.
os.makedirs(out, exist_ok=True)
try:
    run_vocoder(out / (name + '.wav'))
except KeyboardInterrupt:
    # Ctrl-C: terminate without a traceback and with a non-zero status.
    # sys.exit is used rather than the ``exit`` helper, which the ``site``
    # module injects for interactive sessions and which is not available
    # when Python is started with -S.
    sys.exit(-1)