{
  "base_config": "egs/vocoder/gan/exp_config_base.json",
  "preprocess": {
    // Acoustic features to extract
    "extract_mel": true,
    "extract_audio": true,
    "extract_pitch": true,
    // Features used for model training
    "use_mel": true,
    "use_audio": true,
    "use_frame_pitch": true
  },
  "model": {
    "generator": "nsfhifigan",
    "nsfhifigan": {
      "resblock": "1",
      "harmonic_num": 8,
      "upsample_rates": [8, 4, 2, 2, 2],
      "upsample_kernel_sizes": [16, 8, 4, 4, 4],
      "upsample_initial_channel": 768,
      "resblock_kernel_sizes": [3, 7, 11],
      "resblock_dilation_sizes": [
        [1, 3, 5],
        [1, 3, 5],
        [1, 3, 5]
      ]
    },
    "mpd": {
      "mpd_reshapes": [2, 3, 5, 7, 11, 17, 23, 37],
      "use_spectral_norm": false,
      "discriminator_channel_multi": 1
    }
  },
  "train": {
    "criterions": [
      "feature",
      "discriminator",
      "generator",
      "mel"
    ]
  },
  "inference": {
    "batch_size": 1
  }
}