Roh committed on
Commit
7cfd978
1 Parent(s): 342a0e6

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +49 -45
README.md CHANGED
@@ -12,7 +12,7 @@ widget:
12
  ---
13
  ## RyanSpeech model (based on ESPnet2)
14
 
15
- ### `espnet/english_male_ryanspeech_tacotron`
16
  This model was trained by Rohola Zandie using the ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
17
 
18
 
@@ -24,12 +24,12 @@ You can download RyanSpeech dataset from [here](https://www.kaggle.com/datasets/
24
  <details><summary>expand</summary>
25
 
26
  ```
27
- config: conf/train.yaml
28
  print_config: false
29
  log_level: INFO
30
  dry_run: false
31
  iterator_type: sequence
32
- output_dir: exp/tts_train_raw_phn_tacotron_g2p_en_no_space
33
  ngpu: 1
34
  seed: 0
35
  num_workers: 1
@@ -48,7 +48,7 @@ cudnn_benchmark: false
48
  cudnn_deterministic: true
49
  collect_stats: false
50
  write_collected_feats: false
51
- max_epoch: 200
52
  patience: null
53
  val_scheduler_criterion:
54
  - valid
@@ -68,7 +68,7 @@ keep_nbest_models: 5
68
  grad_clip: 1.0
69
  grad_clip_type: 2.0
70
  grad_noise: false
71
- accum_grad: 1
72
  no_forward_run: false
73
  resume: true
74
  train_dtype: float32
@@ -79,14 +79,14 @@ pretrain_key: []
79
  num_iters_per_epoch: 500
80
  batch_size: 20
81
  valid_batch_size: null
82
- batch_bins: 5120000
83
  valid_batch_bins: null
84
  train_shape_file:
85
- - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
86
- - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/speech_shape
87
  valid_shape_file:
88
- - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
89
- - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/speech_shape
90
  batch_type: numel
91
  valid_batch_type: null
92
  fold_length:
@@ -102,6 +102,9 @@ train_data_path_and_name_and_type:
102
  - - dump/raw/tr_no_dev/text
103
  - text
104
  - text
 
 
 
105
  - - dump/raw/tr_no_dev/wav.scp
106
  - speech
107
  - sound
@@ -109,6 +112,9 @@ valid_data_path_and_name_and_type:
109
  - - dump/raw/dev/text
110
  - text
111
  - text
 
 
 
112
  - - dump/raw/dev/wav.scp
113
  - speech
114
  - sound
@@ -118,11 +124,11 @@ max_cache_fd: 32
118
  valid_max_cache_size: null
119
  optim: adam
120
  optim_conf:
121
- lr: 0.001
122
- eps: 1.0e-06
123
- weight_decay: 0.0
124
- scheduler: null
125
- scheduler_conf: {}
126
  token_list:
127
  - <blank>
128
  - <unk>
@@ -220,40 +226,37 @@ feats_extract_conf:
220
  win_length: null
221
  normalize: global_mvn
222
  normalize_conf:
223
- stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
224
- tts: tacotron2
225
  tts_conf:
226
- embed_dim: 512
227
- elayers: 1
228
- eunits: 512
229
- econv_layers: 3
230
- econv_chans: 512
231
- econv_filts: 5
232
- atype: location
233
- adim: 512
234
- aconv_chans: 32
235
- aconv_filts: 15
236
- cumulate_att_w: true
237
- dlayers: 2
238
- dunits: 1024
239
- prenet_layers: 2
240
- prenet_units: 256
241
  postnet_layers: 5
242
- postnet_chans: 512
243
  postnet_filts: 5
244
- output_activation: null
245
- use_batch_norm: true
246
- use_concate: true
247
- use_residual: false
248
- dropout_rate: 0.5
249
- zoneout_rate: 0.1
250
- reduction_factor: 1
251
- spk_embed_dim: null
252
  use_masking: true
253
- bce_pos_weight: 5.0
254
- use_guided_attn_loss: true
255
- guided_attn_loss_sigma: 0.4
256
- guided_attn_loss_lambda: 1.0
 
 
 
 
 
 
 
 
 
257
  pitch_extract: null
258
  pitch_extract_conf: {}
259
  pitch_normalize: null
@@ -267,6 +270,7 @@ required:
267
  - token_list
268
  distributed: false
269
 
 
270
  ```
271
 
272
  </details>
 
12
  ---
13
  ## RyanSpeech model (based on ESPnet2)
14
 
15
+ ### `espnet/english_male_ryanspeech_fastspeech2`
16
  This model was trained by Rohola Zandie using the ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
17
 
18
 
 
24
  <details><summary>expand</summary>
25
 
26
  ```
27
+ config: conf/tuning/train_fastspeech.yaml
28
  print_config: false
29
  log_level: INFO
30
  dry_run: false
31
  iterator_type: sequence
32
+ output_dir: exp/tts_train_fastspeech2_raw_phn_tacotron_g2p_en_no_space
33
  ngpu: 1
34
  seed: 0
35
  num_workers: 1
 
48
  cudnn_deterministic: true
49
  collect_stats: false
50
  write_collected_feats: false
51
+ max_epoch: 1000
52
  patience: null
53
  val_scheduler_criterion:
54
  - valid
 
68
  grad_clip: 1.0
69
  grad_clip_type: 2.0
70
  grad_noise: false
71
+ accum_grad: 6
72
  no_forward_run: false
73
  resume: true
74
  train_dtype: float32
 
79
  num_iters_per_epoch: 500
80
  batch_size: 20
81
  valid_batch_size: null
82
+ batch_bins: 800000
83
  valid_batch_bins: null
84
  train_shape_file:
85
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/text_shape.phn
86
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/speech_shape
87
  valid_shape_file:
88
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/text_shape.phn
89
+ - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/speech_shape
90
  batch_type: numel
91
  valid_batch_type: null
92
  fold_length:
 
102
  - - dump/raw/tr_no_dev/text
103
  - text
104
  - text
105
+ - - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/tr_no_dev/durations
106
+ - durations
107
+ - text_int
108
  - - dump/raw/tr_no_dev/wav.scp
109
  - speech
110
  - sound
 
112
  - - dump/raw/dev/text
113
  - text
114
  - text
115
+ - - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/dev/durations
116
+ - durations
117
+ - text_int
118
  - - dump/raw/dev/wav.scp
119
  - speech
120
  - sound
 
124
  valid_max_cache_size: null
125
  optim: adam
126
  optim_conf:
127
+ lr: 1.0
128
+ scheduler: noamlr
129
+ scheduler_conf:
130
+ model_size: 384
131
+ warmup_steps: 4000
132
  token_list:
133
  - <blank>
134
  - <unk>
 
226
  win_length: null
227
  normalize: global_mvn
228
  normalize_conf:
229
+ stats_file: exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz
230
+ tts: fastspeech
231
  tts_conf:
232
+ adim: 384
233
+ aheads: 2
234
+ elayers: 6
235
+ eunits: 1536
236
+ dlayers: 6
237
+ dunits: 1536
238
+ positionwise_layer_type: conv1d
239
+ positionwise_conv_kernel_size: 3
240
+ duration_predictor_layers: 2
241
+ duration_predictor_chans: 384
242
+ duration_predictor_kernel_size: 3
 
 
 
 
243
  postnet_layers: 5
 
244
  postnet_filts: 5
245
+ postnet_chans: 256
 
 
 
 
 
 
 
246
  use_masking: true
247
+ use_scaled_pos_enc: true
248
+ encoder_normalize_before: true
249
+ decoder_normalize_before: true
250
+ reduction_factor: 1
251
+ init_type: xavier_uniform
252
+ init_enc_alpha: 1.0
253
+ init_dec_alpha: 1.0
254
+ transformer_enc_dropout_rate: 0.1
255
+ transformer_enc_positional_dropout_rate: 0.1
256
+ transformer_enc_attn_dropout_rate: 0.1
257
+ transformer_dec_dropout_rate: 0.1
258
+ transformer_dec_positional_dropout_rate: 0.1
259
+ transformer_dec_attn_dropout_rate: 0.1
260
  pitch_extract: null
261
  pitch_extract_conf: {}
262
  pitch_normalize: null
 
270
  - token_list
271
  distributed: false
272
 
273
+
274
  ```
275
 
276
  </details>