espnet
/

english_male_ryanspeech_tacotron

Text-to-Speech

ESPnet

English

audio

Model card Files Files and versions Community

Roh committed on May 11, 2022

Commit

ac55289

•

1 Parent(s): f6455e0

Update README.md

Browse files

Files changed (1) hide show

README.md +46 -48

README.md CHANGED Viewed

@@ -12,7 +12,7 @@ widget:
 ---
 ## RyanSpeech model (based on ESPnet2)
-### `espnet/english_ryanspeech_fastspeech`
 This model was trained by Rohola Zandie using the ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
@@ -24,12 +24,12 @@ You can download RyanSpeech dataset from [here](https://www.kaggle.com/datasets/
 <details><summary>expand</summary>
 ```
-config: conf/tuning/train_fastspeech.yaml
 print_config: false
 log_level: INFO
 dry_run: false
 iterator_type: sequence
-output_dir: exp/tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space
 ngpu: 1
 seed: 0
 num_workers: 1
@@ -48,7 +48,7 @@ cudnn_benchmark: false
 cudnn_deterministic: true
 collect_stats: false
 write_collected_feats: false
-max_epoch: 1000
 patience: null
 val_scheduler_criterion:
 - valid
@@ -68,7 +68,7 @@ keep_nbest_models: 5
 grad_clip: 1.0
 grad_clip_type: 2.0
 grad_noise: false
-accum_grad: 6
 no_forward_run: false
 resume: true
 train_dtype: float32
@@ -79,14 +79,14 @@ pretrain_key: []
 num_iters_per_epoch: 500
 batch_size: 20
 valid_batch_size: null
-batch_bins: 800000
 valid_batch_bins: null
 train_shape_file:
-- exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/text_shape.phn
-- exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/speech_shape
 valid_shape_file:
-- exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/valid/text_shape.phn
-- exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/valid/speech_shape
 batch_type: numel
 valid_batch_type: null
 fold_length:
@@ -102,9 +102,6 @@ train_data_path_and_name_and_type:
 -   - dump/raw/tr_no_dev/text
     - text
     - text
--   - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best//tr_no_dev/durations
-    - durations
-    - text_int
 -   - dump/raw/tr_no_dev/wav.scp
     - speech
     - sound
@@ -112,9 +109,6 @@ valid_data_path_and_name_and_type:
 -   - dump/raw/dev/text
     - text
     - text
--   - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best//dev/durations
-    - durations
-    - text_int
 -   - dump/raw/dev/wav.scp
     - speech
     - sound
@@ -124,11 +118,11 @@ max_cache_fd: 32
 valid_max_cache_size: null
 optim: adam
 optim_conf:
-    lr: 1.0
-scheduler: noamlr
-scheduler_conf:
-    model_size: 384
-    warmup_steps: 4000
 token_list:
 - <blank>
 - <unk>
@@ -226,37 +220,40 @@ feats_extract_conf:
     win_length: null
 normalize: global_mvn
 normalize_conf:
-    stats_file: exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/feats_stats.npz
-tts: fastspeech
 tts_conf:
-    adim: 384
-    aheads: 2
-    elayers: 6
-    eunits: 1536
-    dlayers: 6
-    dunits: 1536
-    positionwise_layer_type: conv1d
-    positionwise_conv_kernel_size: 3
-    duration_predictor_layers: 2
-    duration_predictor_chans: 384
-    duration_predictor_kernel_size: 3
     postnet_layers: 5
     postnet_filts: 5
-    postnet_chans: 256
-    use_masking: true
-    use_scaled_pos_enc: true
-    encoder_normalize_before: true
-    decoder_normalize_before: true
     reduction_factor: 1
-    init_type: xavier_uniform
-    init_enc_alpha: 1.0
-    init_dec_alpha: 1.0
-    transformer_enc_dropout_rate: 0.1
-    transformer_enc_positional_dropout_rate: 0.1
-    transformer_enc_attn_dropout_rate: 0.1
-    transformer_dec_dropout_rate: 0.1
-    transformer_dec_positional_dropout_rate: 0.1
-    transformer_dec_attn_dropout_rate: 0.1
 pitch_extract: null
 pitch_extract_conf: {}
 pitch_normalize: null
@@ -269,6 +266,7 @@ required:
 - output_dir
 - token_list
 distributed: false
 ```
 </details>

 ---
 ## RyanSpeech model (based on ESPnet2)
+### `espnet/english_male_ryanspeech_tacotron`
 This model was trained by Rohola Zandie using the ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
 <details><summary>expand</summary>
 ```
+config: conf/train.yaml
 print_config: false
 log_level: INFO
 dry_run: false
 iterator_type: sequence
+output_dir: exp/tts_train_raw_phn_tacotron_g2p_en_no_space
 ngpu: 1
 seed: 0
 num_workers: 1
 cudnn_deterministic: true
 collect_stats: false
 write_collected_feats: false
+max_epoch: 200
 patience: null
 val_scheduler_criterion:
 - valid
 grad_clip: 1.0
 grad_clip_type: 2.0
 grad_noise: false
+accum_grad: 1
 no_forward_run: false
 resume: true
 train_dtype: float32
 num_iters_per_epoch: 500
 batch_size: 20
 valid_batch_size: null
+batch_bins: 5120000
 valid_batch_bins: null
 train_shape_file:
+- exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
+- exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/speech_shape
 valid_shape_file:
+- exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
+- exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/speech_shape
 batch_type: numel
 valid_batch_type: null
 fold_length:
 -   - dump/raw/tr_no_dev/text
     - text
     - text
 -   - dump/raw/tr_no_dev/wav.scp
     - speech
     - sound
 -   - dump/raw/dev/text
     - text
     - text
 -   - dump/raw/dev/wav.scp
     - speech
     - sound
 valid_max_cache_size: null
 optim: adam
 optim_conf:
+    lr: 0.001
+    eps: 1.0e-06
+    weight_decay: 0.0
+scheduler: null
+scheduler_conf: {}
 token_list:
 - <blank>
 - <unk>
     win_length: null
 normalize: global_mvn
 normalize_conf:
+    stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
+tts: tacotron2
 tts_conf:
+    embed_dim: 512
+    elayers: 1
+    eunits: 512
+    econv_layers: 3
+    econv_chans: 512
+    econv_filts: 5
+    atype: location
+    adim: 512
+    aconv_chans: 32
+    aconv_filts: 15
+    cumulate_att_w: true
+    dlayers: 2
+    dunits: 1024
+    prenet_layers: 2
+    prenet_units: 256
     postnet_layers: 5
+    postnet_chans: 512
     postnet_filts: 5
+    output_activation: null
+    use_batch_norm: true
+    use_concate: true
+    use_residual: false
+    dropout_rate: 0.5
+    zoneout_rate: 0.1
     reduction_factor: 1
+    spk_embed_dim: null
+    use_masking: true
+    bce_pos_weight: 5.0
+    use_guided_attn_loss: true
+    guided_attn_loss_sigma: 0.4
+    guided_attn_loss_lambda: 1.0
 pitch_extract: null
 pitch_extract_conf: {}
 pitch_normalize: null
 - output_dir
 - token_list
 distributed: false
 ```
 </details>