espnet
/

english_male_ryanspeech_fastspeech2

Text-to-Speech

ESPnet

English

audio

Model card Files Files and versions Community

Roh committed on May 11, 2022

Commit

7cfd978

•

1 Parent(s): 342a0e6

Update README.md

Browse files

Files changed (1) hide show

README.md +49 -45

README.md CHANGED Viewed

@@ -12,7 +12,7 @@ widget:
 ---
 ## RyanSpeech model (based on ESPnet2)
-### `espnet/english_male_ryanspeech_tacotron`
 This model was trained by Rohola Zandie using the ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
@@ -24,12 +24,12 @@ You can download RyanSpeech dataset from [here](https://www.kaggle.com/datasets/
 <details><summary>expand</summary>
 ```
-config: conf/train.yaml
 print_config: false
 log_level: INFO
 dry_run: false
 iterator_type: sequence
-output_dir: exp/tts_train_raw_phn_tacotron_g2p_en_no_space
 ngpu: 1
 seed: 0
 num_workers: 1
@@ -48,7 +48,7 @@ cudnn_benchmark: false
 cudnn_deterministic: true
 collect_stats: false
 write_collected_feats: false
-max_epoch: 200
 patience: null
 val_scheduler_criterion:
 - valid
@@ -68,7 +68,7 @@ keep_nbest_models: 5
 grad_clip: 1.0
 grad_clip_type: 2.0
 grad_noise: false
-accum_grad: 1
 no_forward_run: false
 resume: true
 train_dtype: float32
@@ -79,14 +79,14 @@ pretrain_key: []
 num_iters_per_epoch: 500
 batch_size: 20
 valid_batch_size: null
-batch_bins: 5120000
 valid_batch_bins: null
 train_shape_file:
-- exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
-- exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/speech_shape
 valid_shape_file:
-- exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
-- exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/speech_shape
 batch_type: numel
 valid_batch_type: null
 fold_length:
@@ -102,6 +102,9 @@ train_data_path_and_name_and_type:
 -   - dump/raw/tr_no_dev/text
     - text
     - text
 -   - dump/raw/tr_no_dev/wav.scp
     - speech
     - sound
@@ -109,6 +112,9 @@ valid_data_path_and_name_and_type:
 -   - dump/raw/dev/text
     - text
     - text
 -   - dump/raw/dev/wav.scp
     - speech
     - sound
@@ -118,11 +124,11 @@ max_cache_fd: 32
 valid_max_cache_size: null
 optim: adam
 optim_conf:
-    lr: 0.001
-    eps: 1.0e-06
-    weight_decay: 0.0
-scheduler: null
-scheduler_conf: {}
 token_list:
 - <blank>
 - <unk>
@@ -220,40 +226,37 @@ feats_extract_conf:
     win_length: null
 normalize: global_mvn
 normalize_conf:
-    stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
-tts: tacotron2
 tts_conf:
-    embed_dim: 512
-    elayers: 1
-    eunits: 512
-    econv_layers: 3
-    econv_chans: 512
-    econv_filts: 5
-    atype: location
-    adim: 512
-    aconv_chans: 32
-    aconv_filts: 15
-    cumulate_att_w: true
-    dlayers: 2
-    dunits: 1024
-    prenet_layers: 2
-    prenet_units: 256
     postnet_layers: 5
-    postnet_chans: 512
     postnet_filts: 5
-    output_activation: null
-    use_batch_norm: true
-    use_concate: true
-    use_residual: false
-    dropout_rate: 0.5
-    zoneout_rate: 0.1
-    reduction_factor: 1
-    spk_embed_dim: null
     use_masking: true
-    bce_pos_weight: 5.0
-    use_guided_attn_loss: true
-    guided_attn_loss_sigma: 0.4
-    guided_attn_loss_lambda: 1.0
 pitch_extract: null
 pitch_extract_conf: {}
 pitch_normalize: null
@@ -267,6 +270,7 @@ required:
 - token_list
 distributed: false
 ```
 </details>

 ---
 ## RyanSpeech model (based on ESPnet2)
+### `espnet/english_male_ryanspeech_fastspeech2`
 This model was trained by Rohola Zandie using the ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
 <details><summary>expand</summary>
 ```
+config: conf/tuning/train_fastspeech.yaml
 print_config: false
 log_level: INFO
 dry_run: false
 iterator_type: sequence
+output_dir: exp/tts_train_fastspeech2_raw_phn_tacotron_g2p_en_no_space
 ngpu: 1
 seed: 0
 num_workers: 1
 cudnn_deterministic: true
 collect_stats: false
 write_collected_feats: false
+max_epoch: 1000
 patience: null
 val_scheduler_criterion:
 - valid
 grad_clip: 1.0
 grad_clip_type: 2.0
 grad_noise: false
+accum_grad: 6
 no_forward_run: false
 resume: true
 train_dtype: float32
 num_iters_per_epoch: 500
 batch_size: 20
 valid_batch_size: null
+batch_bins: 800000
 valid_batch_bins: null
 train_shape_file:
+- exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/text_shape.phn
+- exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/speech_shape
 valid_shape_file:
+- exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/text_shape.phn
+- exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/speech_shape
 batch_type: numel
 valid_batch_type: null
 fold_length:
 -   - dump/raw/tr_no_dev/text
     - text
     - text
+-   - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/tr_no_dev/durations
+    - durations
+    - text_int
 -   - dump/raw/tr_no_dev/wav.scp
     - speech
     - sound
 -   - dump/raw/dev/text
     - text
     - text
+-   - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/dev/durations
+    - durations
+    - text_int
 -   - dump/raw/dev/wav.scp
     - speech
     - sound
 valid_max_cache_size: null
 optim: adam
 optim_conf:
+    lr: 1.0
+scheduler: noamlr
+scheduler_conf:
+    model_size: 384
+    warmup_steps: 4000
 token_list:
 - <blank>
 - <unk>
     win_length: null
 normalize: global_mvn
 normalize_conf:
+    stats_file: exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz
+tts: fastspeech
 tts_conf:
+    adim: 384
+    aheads: 2
+    elayers: 6
+    eunits: 1536
+    dlayers: 6
+    dunits: 1536
+    positionwise_layer_type: conv1d
+    positionwise_conv_kernel_size: 3
+    duration_predictor_layers: 2
+    duration_predictor_chans: 384
+    duration_predictor_kernel_size: 3
     postnet_layers: 5
     postnet_filts: 5
+    postnet_chans: 256
     use_masking: true
+    use_scaled_pos_enc: true
+    encoder_normalize_before: true
+    decoder_normalize_before: true
+    reduction_factor: 1
+    init_type: xavier_uniform
+    init_enc_alpha: 1.0
+    init_dec_alpha: 1.0
+    transformer_enc_dropout_rate: 0.1
+    transformer_enc_positional_dropout_rate: 0.1
+    transformer_enc_attn_dropout_rate: 0.1
+    transformer_dec_dropout_rate: 0.1
+    transformer_dec_positional_dropout_rate: 0.1
+    transformer_dec_attn_dropout_rate: 0.1
 pitch_extract: null
 pitch_extract_conf: {}
 pitch_normalize: null
 - token_list
 distributed: false
 ```
 </details>