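# HyperPyYAML configuration for a SpeechBrain FastSpeech2 model and its
# SPN predictor, together with a text encoder and a Pretrainer whose
# loadables are those two modules.
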
# Symbol inventory as a plain YAML sequence (40 entries).
# NOTE(review): the entries look like stress-free ARPAbet phonemes plus a
# special `spn` token - confirm against the text front-end that consumes them.
lexicon:
  - AA
  - AE
  - AH
  - AO
  - AW
  - AY
  - B
  - CH
  - D
  - DH
  - EH
  - ER
  - EY
  - F
  - G
  - HH
  - IH
  - IY
  - JH
  - K
  - L
  - M
  - N
  - NG
  - OW
  - OY
  - P
  - R
  - S
  - SH
  - T
  - TH
  - UH
  - UW
  - V
  - W
  - Y
  - Z
  - ZH
  - spn

# Vocabulary / feature sizes, referenced below through !ref.
# NOTE(review): the lexicon in this file lists 40 symbols while n_symbols is
# 42; the two extra indices are presumably reserved for special tokens such
# as padding - confirm before changing either value.
n_symbols: 42
padding_idx: 0
n_mel_channels: 80

# Encoder hyperparameters; passed via !ref to both spn_predictor and model.
enc_num_layers: 4
enc_num_head: 2
enc_d_model: 384
enc_ffn_dim: 1024
enc_k_dim: 384
enc_v_dim: 384
enc_dropout: 0.2

# Decoder hyperparameters; passed via !ref to model only.
dec_num_layers: 4
dec_num_head: 2
dec_d_model: 384
dec_ffn_dim: 1024
dec_k_dim: 384
dec_v_dim: 384
dec_dropout: 0.2

# Postnet hyperparameters; passed via !ref to model.
postnet_embedding_dim: 512
postnet_kernel_size: 5
postnet_n_convolutions: 5
postnet_dropout: 0.5

# Settings shared by spn_predictor and model (both receive them via !ref).
normalize_before: True
# Plain scalar: starts with a digit but is not numeric, so it stays a string.
ffn_type: 1dcnn
ffn_cnn_kernel_size_list: [9, 1]

# Duration / pitch / energy predictor settings; passed via !ref to model.
dur_pred_kernel_size: 3
pitch_pred_kernel_size: 3
energy_pred_kernel_size: 3
variance_predictor_dropout: 0.5

# SPNPredictor instance. The indented keys are constructor keyword arguments;
# they must be nested under the !new tag, otherwise they become (duplicate)
# top-level keys. Every value is a !ref to a scalar defined earlier in the file.
spn_predictor: !new:speechbrain.lobes.models.FastSpeech2.SPNPredictor
  enc_num_layers: !ref <enc_num_layers>
  enc_num_head: !ref <enc_num_head>
  enc_d_model: !ref <enc_d_model>
  enc_ffn_dim: !ref <enc_ffn_dim>
  enc_k_dim: !ref <enc_k_dim>
  enc_v_dim: !ref <enc_v_dim>
  enc_dropout: !ref <enc_dropout>
  normalize_before: !ref <normalize_before>
  ffn_type: !ref <ffn_type>
  ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
  n_char: !ref <n_symbols>
  padding_idx: !ref <padding_idx>

# FastSpeech2 instance. The indented keys are constructor keyword arguments
# nested under the !new tag; every value is a !ref to a scalar defined earlier
# in the file (n_char <- n_symbols, n_mels <- n_mel_channels).
model: !new:speechbrain.lobes.models.FastSpeech2.FastSpeech2
  enc_num_layers: !ref <enc_num_layers>
  enc_num_head: !ref <enc_num_head>
  enc_d_model: !ref <enc_d_model>
  enc_ffn_dim: !ref <enc_ffn_dim>
  enc_k_dim: !ref <enc_k_dim>
  enc_v_dim: !ref <enc_v_dim>
  enc_dropout: !ref <enc_dropout>
  dec_num_layers: !ref <dec_num_layers>
  dec_num_head: !ref <dec_num_head>
  dec_d_model: !ref <dec_d_model>
  dec_ffn_dim: !ref <dec_ffn_dim>
  dec_k_dim: !ref <dec_k_dim>
  dec_v_dim: !ref <dec_v_dim>
  dec_dropout: !ref <dec_dropout>
  normalize_before: !ref <normalize_before>
  ffn_type: !ref <ffn_type>
  ffn_cnn_kernel_size_list: !ref <ffn_cnn_kernel_size_list>
  n_char: !ref <n_symbols>
  n_mels: !ref <n_mel_channels>
  postnet_embedding_dim: !ref <postnet_embedding_dim>
  postnet_kernel_size: !ref <postnet_kernel_size>
  postnet_n_convolutions: !ref <postnet_n_convolutions>
  postnet_dropout: !ref <postnet_dropout>
  padding_idx: !ref <padding_idx>
  dur_pred_kernel_size: !ref <dur_pred_kernel_size>
  pitch_pred_kernel_size: !ref <pitch_pred_kernel_size>
  energy_pred_kernel_size: !ref <energy_pred_kernel_size>
  variance_predictor_dropout: !ref <variance_predictor_dropout>

# Text encoder object, constructed here with no keyword arguments.
input_encoder: !new:speechbrain.dataio.encoder.TextEncoder

# Mapping of module names to the two objects instantiated earlier in the file.
# The entries must be nested; at column 0 they would collide with the
# top-level spn_predictor / model keys.
modules:
  spn_predictor: !ref <spn_predictor>
  model: !ref <model>

# Pretrainer instance; `loadables` is its keyword argument and maps names to
# the spn_predictor and model objects defined earlier in the file.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    spn_predictor: !ref <spn_predictor>
    model: !ref <model>