# yingzhi's picture
# Update hyperparams.yaml
# 122f8f6
# ################################
# Model: Fastspeech2 Internal Alignment
# Authors: Yingzhi Wang
# ################################
# Input parameters
# Symbol inventory, 50 entries in total.
# NOTE(review): entry order presumably determines the symbol indices seen by
# the model — do not reorder without confirming against the text encoder.
lexicon:
  # Phoneme symbols (these look like the ARPAbet set without stress digits).
  - "AA"
  - "AE"
  - "AH"
  - "AO"
  - "AW"
  - "AY"
  - "B"
  - "CH"
  - "D"
  - "DH"
  - "EH"
  - "ER"
  - "EY"
  - "F"
  - "G"
  - "HH"
  - "IH"
  - "IY"
  - "JH"
  - "K"
  - "L"
  - "M"
  - "N"
  - "NG"
  - "OW"
  - "OY"
  - "P"
  - "R"
  - "S"
  - "SH"
  - "T"
  - "TH"
  - "UH"
  - "UW"
  - "V"
  - "W"
  - "Y"
  - "Z"
  - "ZH"
  # Punctuation marks, followed by the space character.
  - "-"
  - "!"
  - "'"
  - "("
  - ")"
  - ","
  - "."
  - ":"
  - ";"
  - "?"
  - " "
# Symbol count: the 50 lexicon entries, +1 for a dummy symbol used for
# padding, +1 for unknown. Fixed by the lexicon, so update both together.
# Handed to the model as n_char.
n_symbols: 52
# Handed to the model as padding_idx.
# NOTE(review): presumably the index of the dummy padding symbol — confirm.
padding_idx: 0
# Handed to the model as n_mels.
n_mel_channels: 80
# Shared width that the encoder/decoder d_model, k_dim and v_dim settings and
# the aligner's in_key_channels all point to via !ref.
hidden_channels: 512
# Encoder parameters
# Each value below is forwarded to the model under the same key name.
enc_num_layers: 4
enc_num_head: 2
# Model width follows hidden_channels.
enc_d_model: !ref <hidden_channels>
enc_ffn_dim: 1024
# Key and value dimensions also follow hidden_channels.
enc_k_dim: !ref <hidden_channels>
enc_v_dim: !ref <hidden_channels>
enc_dropout: 0.2
# Aligner parameters
# Each value below is forwarded to the model under the same key name.
# NOTE(review): in_query_channels and attn_channels both equal n_mel_channels
# (80) but are hard-coded rather than tied with !ref — confirm whether they
# have to track n_mel_channels.
in_query_channels: 80
# Key width follows hidden_channels.
in_key_channels: !ref <hidden_channels>
attn_channels: 80
temperature: 0.0005
# Decoder parameters
# Each value below is forwarded to the model under the same key name.
# The values mirror the encoder settings one for one.
dec_num_layers: 4
dec_num_head: 2
# Model width follows hidden_channels.
dec_d_model: !ref <hidden_channels>
dec_ffn_dim: 1024
# Key and value dimensions also follow hidden_channels.
dec_k_dim: !ref <hidden_channels>
dec_v_dim: !ref <hidden_channels>
dec_dropout: 0.2
# Postnet parameters
# Each value below is forwarded to the model under the same key name.
# NOTE(review): postnet_embedding_dim equals hidden_channels (512) but is set
# independently of it — confirm whether the two are meant to stay in sync.
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.2
# Common
# Each value below is forwarded to the model under the same key name.
# NOTE(review): "common" presumably means shared by encoder and decoder —
# confirm against the model constructor.
normalize_before: True
# Feed-forward block type: 1dcnn or ffn.
ffn_type: 1dcnn
ffn_cnn_kernel_size_list: [9, 1]
# Variance predictor
# Kernel sizes for the duration, pitch and energy predictors, plus one dropout
# value named for the variance predictors as a group. Each value is forwarded
# to the model under the same key name.
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5
# Model
# Instantiates FastSpeech2WithAlignment. The constructor arguments must be
# nested under the !new tag so they are passed as keyword arguments; left at
# column 0 they would become duplicate, self-referencing top-level keys.
# Every argument points at the top-level hyperparameter of the same name,
# except n_char (from n_symbols) and n_mels (from n_mel_channels).
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2WithAlignment
  # Encoder
  enc_num_layers: !ref <enc_num_layers>
  enc_num_head: !ref <enc_num_head>
  enc_d_model: !ref <enc_d_model>
  enc_ffn_dim: !ref <enc_ffn_dim>
  enc_k_dim: !ref <enc_k_dim>
  enc_v_dim: !ref <enc_v_dim>
  enc_dropout: !ref <enc_dropout>
  # Aligner
  in_query_channels: !ref <in_query_channels>
  in_key_channels: !ref <in_key_channels>
  attn_channels: !ref <attn_channels>
  temperature: !ref <temperature>
  # Decoder
  dec_num_layers: !ref <dec_num_layers>
  dec_num_head: !ref <dec_num_head>
  dec_d_model: !ref <dec_d_model>
  dec_ffn_dim: !ref <dec_ffn_dim>
  dec_k_dim: !ref <dec_k_dim>
  dec_v_dim: !ref <dec_v_dim>
  dec_dropout: !ref <dec_dropout>
  # Common
  normalize_before: !ref <normalize_before>
  ffn_type: !ref <ffn_type>
  ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
  # Input/output sizes
  n_char: !ref <n_symbols>
  n_mels: !ref <n_mel_channels>
  # Postnet
  postnet_embedding_dim: !ref <postnet_embedding_dim>
  postnet_kernel_size: !ref <postnet_kernel_size>
  postnet_n_convolutions: !ref <postnet_n_convolutions>
  postnet_dropout: !ref <postnet_dropout>
  padding_idx: !ref <padding_idx>
  # Variance predictors
  dur_pred_kernel_size: !ref <dur_pred_kernel_size>
  pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
  energy_pred_kernel_size: !ref <energy_pred_kernel_size>
  variance_predictor_dropout: !ref <variance_predictor_dropout>
# Text encoder created with no constructor arguments.
# NOTE(review): presumably the loading code fills it from the lexicon entries
# (plus padding/unknown symbols) — confirm against the inference interface.
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder
# Named modules exposed by this file. The entry has to be nested: at column 0
# `modules` would be null and `model` would be a duplicate top-level key that
# references itself.
modules:
  model: !ref <model>
# Pretrainer instance. `loadables` is nested so it is passed as a constructor
# argument; it maps the name `model` to the model object defined in this file.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    model: !ref <model>