# ################################
# Model: Fastspeech2 Internal Alignment
# Authors: Yingzhi Wang
# ################################
#
# Hyperparameters that build the FastSpeech2WithAlignment model, its text
# encoder, and the pretrainer used to load the model weights.
# NOTE(review): the `!ref <...>` targets below were restored from the key
# names after the file's formatting was lost -- confirm against upstream.

# Input parameters
# Symbol inventory: 39 phoneme labels followed by 11 punctuation/space symbols.
lexicon:
    - "AA"
    - "AE"
    - "AH"
    - "AO"
    - "AW"
    - "AY"
    - "B"
    - "CH"
    - "D"
    - "DH"
    - "EH"
    - "ER"
    - "EY"
    - "F"
    - "G"
    - "HH"
    - "IH"
    - "IY"
    - "JH"
    - "K"
    - "L"
    - "M"
    - "N"
    - "NG"
    - "OW"
    - "OY"
    - "P"
    - "R"
    - "S"
    - "SH"
    - "T"
    - "TH"
    - "UH"
    - "UW"
    - "V"
    - "W"
    - "Y"
    - "Z"
    - "ZH"
    - "-"
    - "!"
    - "'"
    - "("
    - ")"
    - ","
    - "."
    - ":"
    - ";"
    - "?"
    - " "

# Fixed by the lexicon size: 50 symbols above, +1 for a dummy symbol used for
# padding, +1 for unknown. Update this whenever the lexicon changes.
n_symbols: 52
padding_idx: 0
n_mel_channels: 80
# Shared width reused by the encoder, aligner and decoder settings below.
hidden_channels: 512

# Encoder parameters
enc_num_layers: 4
enc_num_head: 2
enc_d_model: !ref <hidden_channels>
enc_ffn_dim: 1024
enc_k_dim: !ref <hidden_channels>
enc_v_dim: !ref <hidden_channels>
enc_dropout: 0.2

# Aligner parameters
in_query_channels: 80
in_key_channels: !ref <hidden_channels>
attn_channels: 80
temperature: 0.0005

# Decoder parameters
dec_num_layers: 4
dec_num_head: 2
dec_d_model: !ref <hidden_channels>
dec_ffn_dim: 1024
dec_k_dim: !ref <hidden_channels>
dec_v_dim: !ref <hidden_channels>
dec_dropout: 0.2

# Postnet parameters
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.2

# Common
normalize_before: True
ffn_type: 1dcnn #1dcnn or ffn
ffn_cnn_kernel_size_list: [9, 1]

# Variance predictor
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5

# Model
# Every constructor argument is wired to the top-level key defined above, so
# values only need to be changed in one place.
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2WithAlignment
    enc_num_layers: !ref <enc_num_layers>
    enc_num_head: !ref <enc_num_head>
    enc_d_model: !ref <enc_d_model>
    enc_ffn_dim: !ref <enc_ffn_dim>
    enc_k_dim: !ref <enc_k_dim>
    enc_v_dim: !ref <enc_v_dim>
    enc_dropout: !ref <enc_dropout>
    in_query_channels: !ref <in_query_channels>
    in_key_channels: !ref <in_key_channels>
    attn_channels: !ref <attn_channels>
    temperature: !ref <temperature>
    dec_num_layers: !ref <dec_num_layers>
    dec_num_head: !ref <dec_num_head>
    dec_d_model: !ref <dec_d_model>
    dec_ffn_dim: !ref <dec_ffn_dim>
    dec_k_dim: !ref <dec_k_dim>
    dec_v_dim: !ref <dec_v_dim>
    dec_dropout: !ref <dec_dropout>
    normalize_before: !ref <normalize_before>
    ffn_type: !ref <ffn_type>
    ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
    # NOTE(review): n_char / n_mels have no same-named key; mapped to
    # n_symbols / n_mel_channels -- confirm.
    n_char: !ref <n_symbols>
    n_mels: !ref <n_mel_channels>
    postnet_embedding_dim: !ref <postnet_embedding_dim>
    postnet_kernel_size: !ref <postnet_kernel_size>
    postnet_n_convolutions: !ref <postnet_n_convolutions>
    postnet_dropout: !ref <postnet_dropout>
    padding_idx: !ref <padding_idx>
    dur_pred_kernel_size: !ref <dur_pred_kernel_size>
    pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
    energy_pred_kernel_size: !ref <energy_pred_kernel_size>
    variance_predictor_dropout: !ref <variance_predictor_dropout>

# Text encoder instance, constructed with no arguments.
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder

modules:
    model: !ref <model>

# Pretrainer whose only loadable is the model defined above.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>