File size: 4,612 Bytes
4817bcc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
import time
import os
import argparse
import torch
import numpy as np
import glob
from pathlib import Path
from tqdm import tqdm
from ppg_extractor import load_model
import librosa
import soundfile as sf
from utils.load_yaml import HpsYaml

from encoder.audio import preprocess_wav
from encoder import inference as speacker_encoder
from vocoder.hifigan import inference as vocoder
from ppg2mel import MelDecoderMOLv2
from utils.f0_utils import compute_f0, f02lf0, compute_mean_std, get_converted_lf0uv


def _build_ppg2mel_model(model_config, model_file, device):
    ppg2mel_model = MelDecoderMOLv2(
        **model_config["model"]
    ).to(device)
    ckpt = torch.load(model_file, map_location=device)
    ppg2mel_model.load_state_dict(ckpt["model"])
    ppg2mel_model.eval()
    return ppg2mel_model


@torch.no_grad()
def convert(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    step = os.path.basename(args.ppg2mel_model_file)[:-4].split("_")[-1]

    # Build models
    print("Load PPG-model, PPG2Mel-model, Vocoder-model...")
    ppg_model = load_model(
        Path('./ppg_extractor/saved_models/24epoch.pt'),
        device,
    )
    ppg2mel_model = _build_ppg2mel_model(HpsYaml(args.ppg2mel_model_train_config), args.ppg2mel_model_file, device) 
    # vocoder.load_model('./vocoder/saved_models/pretrained/g_hifigan.pt', "./vocoder/hifigan/config_16k_.json")
    vocoder.load_model('./vocoder/saved_models/24k/g_02830000.pt')
    # Data related
    ref_wav_path = args.ref_wav_path
    ref_wav = preprocess_wav(ref_wav_path)
    ref_fid = os.path.basename(ref_wav_path)[:-4]
    
    # TODO: specify encoder
    speacker_encoder.load_model(Path("encoder/saved_models/pretrained_bak_5805000.pt"))
    ref_spk_dvec = speacker_encoder.embed_utterance(ref_wav)
    ref_spk_dvec = torch.from_numpy(ref_spk_dvec).unsqueeze(0).to(device)
    ref_lf0_mean, ref_lf0_std = compute_mean_std(f02lf0(compute_f0(ref_wav)))
    
    source_file_list = sorted(glob.glob(f"{args.wav_dir}/*.wav"))
    print(f"Number of source utterances: {len(source_file_list)}.")
    
    total_rtf = 0.0
    cnt = 0
    for src_wav_path in tqdm(source_file_list):
        # Load the audio to a numpy array:
        src_wav, _ = librosa.load(src_wav_path, sr=16000)
        src_wav_tensor = torch.from_numpy(src_wav).unsqueeze(0).float().to(device)
        src_wav_lengths = torch.LongTensor([len(src_wav)]).to(device)
        ppg = ppg_model(src_wav_tensor, src_wav_lengths)

        lf0_uv = get_converted_lf0uv(src_wav, ref_lf0_mean, ref_lf0_std, convert=True)
        min_len = min(ppg.shape[1], len(lf0_uv))

        ppg = ppg[:, :min_len]
        lf0_uv = lf0_uv[:min_len]
        
        start = time.time()
        _, mel_pred, att_ws = ppg2mel_model.inference(
            ppg,
            logf0_uv=torch.from_numpy(lf0_uv).unsqueeze(0).float().to(device),
            spembs=ref_spk_dvec,
        )
        src_fid = os.path.basename(src_wav_path)[:-4]
        wav_fname = f"{output_dir}/vc_{src_fid}_ref_{ref_fid}_step{step}.wav"
        mel_len = mel_pred.shape[0]
        rtf = (time.time() - start) / (0.01 * mel_len)
        total_rtf += rtf
        cnt += 1
        # continue
        mel_pred= mel_pred.transpose(0, 1)
        y, output_sample_rate = vocoder.infer_waveform(mel_pred.cpu())
        sf.write(wav_fname, y.squeeze(), output_sample_rate, "PCM_16")
    
    print("RTF:")
    print(total_rtf / cnt)


def get_parser():
    parser = argparse.ArgumentParser(description="Conversion from wave input")
    parser.add_argument(
        "--wav_dir",
        type=str,
        default=None,
        required=True,
        help="Source wave directory.",
    )
    parser.add_argument(
        "--ref_wav_path",
        type=str,
        required=True,
        help="Reference wave file path.",
    )
    parser.add_argument(
        "--ppg2mel_model_train_config", "-c",
        type=str,
        default=None,
        required=True,
        help="Training config file (yaml file)",
    )
    parser.add_argument(
        "--ppg2mel_model_file", "-m",
        type=str,
        default=None,
        required=True,
        help="ppg2mel model checkpoint file path"
    )
    parser.add_argument(
        "--output_dir", "-o",
        type=str,
        default="vc_gens_vctk_oneshot",
        help="Output folder to save the converted wave."
    )
    
    return parser

def main():
    parser = get_parser()
    args = parser.parse_args()
    convert(args)

if __name__ == "__main__":
    main()