# Source: ukrainian-tts / training/finetune_joint_tacotron2_hifigan.yaml
# Author: Yurii Paniv
# Commit c527edf: Add joint tacotron2_hifigan config
# This EXPERIMENTAL configuration is for ESPnet2 to finetune
# Tacotron2 + HiFiGAN vocoder jointly. To run
# this config, you need to specify "--tts_task gan_tts"
# option for tts.sh at least and use 22050 hz audio as the
# training data (mainly tested on LJspeech).
# This configuration was tested on 4 GPUs with 12GB GPU memory.
# It takes a little less than 1 week to finish the training, but
# a model at 100k iterations should already generate reasonable results.
# YOU NEED TO MODIFY THE "*_params" AND "init_param" SECTIONS
# IF YOU WANT TO USE YOUR OWN PRETRAINED MODELS.
##########################################################
# TTS MODEL SETTING #
##########################################################
# Joint text-to-mel + vocoder model. Everything under `tts_conf` is nested:
# each `*_type` key is followed by the `*_params` mapping that configures it.
tts: joint_text2wav
tts_conf:
  # copied from pretrained model's config.yaml
  text2mel_type: tacotron2
  text2mel_params:
    embed_dim: 512  # char or phn embedding dimension
    elayers: 1  # number of blstm layers in encoder
    eunits: 512  # number of blstm units
    econv_layers: 3  # number of convolutional layers in encoder
    econv_chans: 512  # number of channels in convolutional layer
    econv_filts: 5  # filter size of convolutional layer
    atype: location  # attention function type
    adim: 512  # attention dimension
    aconv_chans: 32  # number of channels in convolutional layer of attention
    aconv_filts: 15  # filter size of convolutional layer of attention
    cumulate_att_w: true  # whether to cumulate attention weight
    dlayers: 2  # number of lstm layers in decoder
    dunits: 1024  # number of lstm units in decoder
    prenet_layers: 2  # number of layers in prenet
    prenet_units: 256  # number of units in prenet
    postnet_layers: 5  # number of layers in postnet
    postnet_chans: 512  # number of channels in postnet
    postnet_filts: 5  # filter size of postnet layer
    output_activation: null  # activation function for the final output
    use_batch_norm: true  # whether to use batch normalization in encoder
    use_concate: true  # whether to concatenate encoder embedding with decoder outputs
    use_residual: false  # whether to use residual connection in encoder
    spk_embed_dim: 192  # speaker embedding dimension
    spk_embed_integration_type: add  # how to integrate speaker embedding
    dropout_rate: 0.5  # dropout rate
    zoneout_rate: 0.1  # zoneout rate
    reduction_factor: 1  # reduction factor
    use_masking: true  # whether to apply masking for padded part in loss calculation
    bce_pos_weight: 10.0  # weight of positive sample in binary cross entropy calculation
    use_guided_attn_loss: true  # whether to use guided attention loss
    guided_attn_loss_sigma: 0.4  # sigma of guided attention loss
    guided_attn_loss_lambda: 1.0  # strength of guided attention loss
  # copied from pretrained vocoder's config.yaml
  vocoder_type: hifigan_generator
  vocoder_params:
    bias: true
    channels: 512
    # same value as mel_loss_params.n_mels below
    in_channels: 80
    kernel_size: 7
    nonlinear_activation: LeakyReLU
    nonlinear_activation_params:
      negative_slope: 0.1
    out_channels: 1
    # one list of dilations per entry in resblock_kernel_sizes
    resblock_dilations:
      - - 1
        - 3
        - 5
      - - 1
        - 3
        - 5
      - - 1
        - 3
        - 5
    resblock_kernel_sizes:
      - 3
      - 7
      - 11
    upsample_kernel_sizes:
      - 16
      - 16
      - 4
      - 4
    # product 8 * 8 * 2 * 2 = 256 equals mel_loss_params.hop_length below;
    # presumably these must stay in sync - confirm before changing either
    upsample_scales:
      - 8
      - 8
      - 2
      - 2
    use_additional_convs: true
    use_weight_norm: true
  # copied from pretrained vocoder's config.yaml
  discriminator_type: hifigan_multi_scale_multi_period_discriminator
  discriminator_params:
    follow_official_norm: true
    period_discriminator_params:
      bias: true
      channels: 32
      downsample_scales:
        - 3
        - 3
        - 3
        - 3
        - 1
      in_channels: 1
      kernel_sizes:
        - 5
        - 3
      max_downsample_channels: 1024
      nonlinear_activation: LeakyReLU
      nonlinear_activation_params:
        negative_slope: 0.1
      out_channels: 1
      use_spectral_norm: false
      use_weight_norm: true
    periods:
      - 2
      - 3
      - 5
      - 7
      - 11
    scale_discriminator_params:
      bias: true
      channels: 128
      downsample_scales:
        - 4
        - 4
        - 4
        - 4
        - 1
      in_channels: 1
      kernel_sizes:
        - 15
        - 41
        - 5
        - 3
      max_downsample_channels: 1024
      max_groups: 16
      nonlinear_activation: LeakyReLU
      nonlinear_activation_params:
        negative_slope: 0.1
      out_channels: 1
    scale_downsample_pooling: AvgPool1d
    scale_downsample_pooling_params:
      kernel_size: 4
      padding: 2
      stride: 2
    scales: 3
  # loss function related
  generator_adv_loss_params:
    average_by_discriminators: false  # whether to average loss value by #discriminators
    loss_type: mse  # loss type, "mse" or "hinge"
  discriminator_adv_loss_params:
    average_by_discriminators: false  # whether to average loss value by #discriminators
    loss_type: mse  # loss type, "mse" or "hinge"
  use_feat_match_loss: true  # whether to use feat match loss
  feat_match_loss_params:
    average_by_discriminators: false  # whether to average loss value by #discriminators
    average_by_layers: false  # whether to average loss value by #layers of each discriminator
    include_final_outputs: true  # whether to include final outputs for loss calculation
  use_mel_loss: true  # whether to use mel-spectrogram loss
  mel_loss_params:
    fs: 22050  # must be the same as the training data
    n_fft: 1024  # fft points
    hop_length: 256  # hop size
    win_length: null  # window length
    window: hann  # window type
    n_mels: 80  # number of Mel basis
    fmin: 0  # minimum frequency for Mel basis
    fmax: null  # maximum frequency for Mel basis
    log_base: null  # null represents natural log
  lambda_text2mel: 1.0  # loss scaling coefficient for text2mel loss
  lambda_adv: 1.0  # loss scaling coefficient for adversarial loss
  lambda_mel: 45.0  # loss scaling coefficient for Mel loss
  lambda_feat_match: 2.0  # loss scaling coefficient for feat match loss
  # others
  sampling_rate: 22050  # needed in the inference for saving wav
  segment_size: 32  # segment size for random windowed discriminator
  cache_generator_outputs: true  # whether to cache generator outputs in the training
# extra module for additional inputs
#pitch_extract: dio # pitch extractor type
#pitch_extract_conf:
# reduction_factor: 1
#pitch_normalize: global_mvn # normalizer for the pitch feature
#energy_extract: energy # energy extractor type
#energy_extract_conf:
# reduction_factor: 1
#energy_normalize: global_mvn # normalizer for the energy feature
# initialization (might need to modify for your own pretrained model)
# Each entry is a colon-separated spec. The visible entries have the shape
# "<checkpoint path>:<source key>:<destination key>"; the vocoder and
# discriminator entries leave the source key empty - presumably meaning
# "load the whole checkpoint" (confirm against ESPnet's --init_param docs).
init_param:
  - 'exp/22k/tts_train_tacotron2_raw_char/train.loss.ave_5best.pth:tts:tts.generator.text2mel'
  - 'exp/22k/ljspeech_hifigan.v1/generator.pth::tts.generator.vocoder'
  - 'exp/22k/ljspeech_hifigan.v1/discriminator.pth::tts.discriminator'
##########################################################
# OPTIMIZER & SCHEDULER SETTING #
##########################################################
# optimizer setting for generator
# Generator optimizer and scheduler. The `*_conf` mappings own the indented
# keys beneath them; the discriminator uses the parallel `*2` keys with the
# same values.
optim: adam
optim_conf:
  lr: 1.25e-5
  betas: [0.5, 0.9]
  weight_decay: 0.0
scheduler: exponentiallr
scheduler_conf:
  gamma: 0.999875
# optimizer setting for discriminator
optim2: adam
optim2_conf:
  lr: 1.25e-5
  betas: [0.5, 0.9]
  weight_decay: 0.0
scheduler2: exponentiallr
scheduler2_conf:
  gamma: 0.999875
generator_first: true  # whether to start updating generator first
##########################################################
# OTHER TRAINING SETTING #
##########################################################
#num_iters_per_epoch: 1000 # number of iterations per epoch
max_epoch: 500  # number of epochs
accum_grad: 1  # gradient accumulation
batch_bins: 1600000  # batch bins (feats_type=raw)
batch_type: numel  # how to make batch
grad_clip: -1  # gradient clipping norm
grad_noise: false  # whether to use gradient noise injection
sort_in_batch: descending  # how to sort data in making batch
sort_batch: descending  # how to sort created batches
num_workers: 4  # number of workers of data loader
use_amp: false  # whether to use pytorch amp
log_interval: 50  # log interval in iterations
keep_nbest_models: 5  # number of models to keep
num_att_plot: 3  # number of attention figures to be saved in every check
seed: 777  # random seed number
patience: null  # patience for early stopping
unused_parameters: true  # needed for multi gpu case
# criterion to save the best models; each entry is a
# [data split, metric name, min/max] triple
best_model_criterion:
  - - valid
    - text2mel_loss
    - min
  - - train
    - text2mel_loss
    - min
  - - train
    - total_count
    - max
# setting to false accelerates the training speed but makes it non-deterministic;
# in the case of GAN-TTS training, we strongly recommend setting to false
cudnn_deterministic: false
# setting to true might accelerate the training speed but sometimes decrease it;
# therefore, we set to false as a default (recommend trying both cases)
cudnn_benchmark: false