Spaces:

Rongjiehuang
/

GenerSpeech

Build error

File size: 5,604 Bytes

222619b

import glob
import re
import librosa
import torch
import yaml
from sklearn.preprocessing import StandardScaler
from torch import nn
from modules.parallel_wavegan.models import ParallelWaveGANGenerator
from modules.parallel_wavegan.utils import read_hdf5
from utils.hparams import hparams
from utils.pitch_utils import f0_to_coarse
from vocoders.base_vocoder import BaseVocoder, register_vocoder
import numpy as np


def load_pwg_model(config_path, checkpoint_path, stats_path):
    """Build a Parallel WaveGAN generator and restore its weights.

    Two checkpoint layouts are supported, distinguished by whether the loaded
    checkpoint contains a top-level ``'state_dict'`` key:

    * absent ("official" vocoder): generator weights are taken from
      ``ckpt["model"]["generator"]`` and a ``StandardScaler`` is populated
      with the mean/scale statistics stored at ``stats_path``.
    * present ("custom" PWG vocoder): the state dict is loaded non-strictly
      into a wrapper module that exposes the generator as ``model_gen``;
      no scaler is created.

    Args:
        config_path: Path to the YAML config. It must provide
            ``generator_params`` and, for official checkpoints, ``format``
            (``"hdf5"`` or ``"npy"``).
        checkpoint_path: Path to the checkpoint file readable by
            ``torch.load``.
        stats_path: Path to the feature statistics file. Only read for
            official checkpoints.

    Returns:
        Tuple ``(model, scaler, config, device)``. ``model`` is in eval mode
        with weight norm removed and moved to ``device``; ``scaler`` is
        ``None`` for custom checkpoints.

    Raises:
        ValueError: If an official checkpoint's config declares a ``format``
            other than ``"hdf5"`` or ``"npy"``.
    """
    # NOTE(security): yaml.Loader can construct arbitrary Python objects.
    # Only point this at trusted config files.
    with open(config_path) as f:
        config = yaml.load(f, Loader=yaml.Loader)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = ParallelWaveGANGenerator(**config["generator_params"])

    # Deserialize the checkpoint exactly once and reuse it in both branches.
    ckpt_dict = torch.load(checkpoint_path, map_location="cpu")
    if 'state_dict' not in ckpt_dict:  # official vocoder
        model.load_state_dict(ckpt_dict["model"]["generator"])
        scaler = StandardScaler()
        stats_format = config["format"]
        if stats_format == "hdf5":
            scaler.mean_ = read_hdf5(stats_path, "mean")
            scaler.scale_ = read_hdf5(stats_path, "scale")
        elif stats_format == "npy":
            # Row 0 holds the mean and row 1 the scale; read the file once.
            stats = np.load(stats_path)
            scaler.mean_ = stats[0]
            scaler.scale_ = stats[1]
        else:
            raise ValueError("support only hdf5 or npy format.")
    else:  # custom PWG vocoder
        # The custom state dict is loaded through a wrapper that holds the
        # generator under the attribute name ``model_gen``; strict=False
        # lets keys that do not belong to the generator be ignored.
        fake_task = nn.Module()
        fake_task.model_gen = model
        fake_task.load_state_dict(ckpt_dict["state_dict"], strict=False)
        scaler = None

    model.remove_weight_norm()
    model = model.eval().to(device)
    print(f"| Loaded model parameters from {checkpoint_path}.")
    print(f"| PWG device: {device}.")
    return model, scaler, config, device


@register_vocoder
class PWG(BaseVocoder):
    """Parallel WaveGAN vocoder: mel-spectrogram to waveform and back.

    On construction the newest checkpoint (highest step count in the file
    name) is located and loaded through ``load_pwg_model``.
    """

    def __init__(self):
        """Locate and load the PWG checkpoint selected by ``hparams``.

        If ``hparams['vocoder_ckpt']`` is the empty string, the pretrained
        model under ``wavegan_pretrained/`` is used (files named
        ``checkpoint-<N>steps.pkl``) together with its feature scaler.
        Otherwise ``hparams['vocoder_ckpt']`` is treated as a directory
        holding ``model_ckpt_steps_<N>.ckpt`` files and no scaler is used.

        Raises:
            FileNotFoundError: If no checkpoint file with a parsable step
                count is found in the selected directory.
        """
        if hparams['vocoder_ckpt'] == '':  # load LJSpeech PWG pretrained model
            base_dir = 'wavegan_pretrained'
            ckpt = self._latest_checkpoint(
                f'{base_dir}/checkpoint-*steps.pkl',
                r'checkpoint-(\d+)steps\.pkl$',
            )
            config_path = f'{base_dir}/config.yaml'
            print('| load PWG: ', ckpt)
            self.model, self.scaler, self.config, self.device = load_pwg_model(
                config_path=config_path,
                checkpoint_path=ckpt,
                stats_path=f'{base_dir}/stats.h5',
            )
        else:
            base_dir = hparams['vocoder_ckpt']
            print(base_dir)
            config_path = f'{base_dir}/config.yaml'
            ckpt = self._latest_checkpoint(
                f'{base_dir}/model_ckpt_steps_*.ckpt',
                r'model_ckpt_steps_(\d+)\.ckpt$',
            )
            print('| load PWG: ', ckpt)
            # Custom checkpoints are used without feature normalization, so
            # whatever scaler the loader returns is discarded.
            self.scaler = None
            self.model, _, self.config, self.device = load_pwg_model(
                config_path=config_path,
                checkpoint_path=ckpt,
                stats_path=f'{base_dir}/stats.h5',
            )

    @staticmethod
    def _latest_checkpoint(pattern, step_regex):
        """Return the path matching ``pattern`` with the highest step count.

        The step count is parsed from the tail of each path with
        ``step_regex`` (one capturing group of digits). Matching only the
        file-name tail keeps the lookup independent of the directory part,
        so regex metacharacters or OS-specific separators in the directory
        cannot break it. Paths whose name does not match are skipped.

        Raises:
            FileNotFoundError: If no path matches both the glob pattern and
                the step regex.
        """
        step_re = re.compile(step_regex)
        candidates = []
        for path in glob.glob(pattern):
            found = step_re.search(path)
            if found is not None:
                candidates.append((int(found.group(1)), path))
        if not candidates:
            raise FileNotFoundError(
                f"No PWG checkpoint found matching '{pattern}'.")
        # Stable sort on the step only, so ties keep glob order as before.
        return sorted(candidates, key=lambda item: item[0])[-1][1]

    def spec2wav(self, mel, **kwargs):
        """Synthesize a waveform from a mel-spectrogram.

        Args:
            mel: 2-D array whose first axis is time frames (it is padded
                along axis 0 and its length sets the noise length).
                Presumably shaped [T, num_mels] -- confirm against callers.
            **kwargs: Optional ``f0`` array; when given it is quantized with
                ``f0_to_coarse`` and passed to the generator as a third
                input.

        Returns:
            1-D numpy array holding the generated samples.
        """
        # start generation
        config = self.config
        device = self.device
        # Pad both ends of the time axis by the generator's context window.
        pad_size = (config["generator_params"]["aux_context_window"],
                    config["generator_params"]["aux_context_window"])
        c = mel
        if self.scaler is not None:
            c = self.scaler.transform(c)

        with torch.no_grad():
            # One noise sample per output sample: frames * hop_size. This is
            # computed before padding so it reflects the unpadded length.
            z = torch.randn(1, 1, c.shape[0] * config["hop_size"]).to(device)
            c = np.pad(c, (pad_size, (0, 0)), "edge")
            # Add a batch axis and swap the last two axes for the generator.
            c = torch.FloatTensor(c).unsqueeze(0).transpose(2, 1).to(device)
            p = kwargs.get('f0')
            if p is not None:
                p = f0_to_coarse(p)
                p = np.pad(p, (pad_size,), "edge")
                p = torch.LongTensor(p[None, :]).to(device)
            y = self.model(z, c, p).view(-1)
        wav_out = y.cpu().numpy()
        return wav_out

    @staticmethod
    def wav2spec(wav_fn, return_linear=False):
        """Extract spectral features from a wav file via ``process_utterance``.

        All analysis parameters are read from ``hparams``.

        Args:
            wav_fn: Path of the audio file, forwarded to
                ``process_utterance``.
            return_linear: When true, a third (transposed) item from the
                result is returned as well.

        Returns:
            ``(res[0], res[1].T)`` or, with ``return_linear``,
            ``(res[0], res[1].T, res[2].T)``, where ``res`` is the result of
            ``process_utterance``. Presumably waveform, mel [T, 80] and
            linear spectrogram [T, n_fft] -- confirm against
            ``process_utterance``.
        """
        # Imported lazily, as in the original code.
        from data_gen.tts.data_gen_utils import process_utterance
        res = process_utterance(
            wav_fn, fft_size=hparams['fft_size'],
            hop_size=hparams['hop_size'],
            win_length=hparams['win_size'],
            num_mels=hparams['audio_num_mel_bins'],
            fmin=hparams['fmin'],
            fmax=hparams['fmax'],
            sample_rate=hparams['audio_sample_rate'],
            loud_norm=hparams['loud_norm'],
            min_level_db=hparams['min_level_db'],
            return_linear=return_linear, vocoder='pwg', eps=float(hparams.get('wav2spec_eps', 1e-10)))
        if return_linear:
            return res[0], res[1].T, res[2].T  # [T, 80], [T, n_fft]
        else:
            return res[0], res[1].T

    @staticmethod
    def wav2mfcc(wav_fn):
        """Compute MFCCs with first- and second-order deltas for a wav file.

        The audio is loaded at ``hparams['audio_sample_rate']``; 13 MFCCs
        are computed with the FFT/hop/window sizes from ``hparams``.

        Args:
            wav_fn: Path of the audio file to load with librosa.

        Returns:
            Array with the MFCCs, deltas and delta-deltas stacked along the
            feature axis and then transposed, so the feature axis (3 * 13
            values) comes last.
        """
        fft_size = hparams['fft_size']
        hop_size = hparams['hop_size']
        win_length = hparams['win_size']
        sample_rate = hparams['audio_sample_rate']
        wav, _ = librosa.core.load(wav_fn, sr=sample_rate)
        mfcc = librosa.feature.mfcc(y=wav, sr=sample_rate, n_mfcc=13,
                                    n_fft=fft_size, hop_length=hop_size,
                                    win_length=win_length, pad_mode="constant", power=1.0)
        mfcc_delta = librosa.feature.delta(mfcc, order=1)
        mfcc_delta_delta = librosa.feature.delta(mfcc, order=2)
        mfcc = np.concatenate([mfcc, mfcc_delta, mfcc_delta_delta]).T
        return mfcc