Roh committed on
Commit
ac55289
1 Parent(s): f6455e0

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +46 -48
README.md CHANGED
@@ -12,7 +12,7 @@ widget:
12
  ---
13
  ## RyanSpeech model (based on ESPnet2)
14
 
15
- ### `espnet/english_ryanspeech_fastspeech`
16
  This model was trained by Rohola Zandie using the ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
17
 
18
 
@@ -24,12 +24,12 @@ You can download RyanSpeech dataset from [here](https://www.kaggle.com/datasets/
24
  <details><summary>expand</summary>
25
 
26
  ```
27
- config: conf/tuning/train_fastspeech.yaml
28
  print_config: false
29
  log_level: INFO
30
  dry_run: false
31
  iterator_type: sequence
32
- output_dir: exp/tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space
33
  ngpu: 1
34
  seed: 0
35
  num_workers: 1
@@ -48,7 +48,7 @@ cudnn_benchmark: false
48
  cudnn_deterministic: true
49
  collect_stats: false
50
  write_collected_feats: false
51
- max_epoch: 1000
52
  patience: null
53
  val_scheduler_criterion:
54
  - valid
@@ -68,7 +68,7 @@ keep_nbest_models: 5
68
  grad_clip: 1.0
69
  grad_clip_type: 2.0
70
  grad_noise: false
71
- accum_grad: 6
72
  no_forward_run: false
73
  resume: true
74
  train_dtype: float32
@@ -79,14 +79,14 @@ pretrain_key: []
79
  num_iters_per_epoch: 500
80
  batch_size: 20
81
  valid_batch_size: null
82
- batch_bins: 800000
83
  valid_batch_bins: null
84
  train_shape_file:
85
- - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/text_shape.phn
86
- - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/speech_shape
87
  valid_shape_file:
88
- - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/valid/text_shape.phn
89
- - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/valid/speech_shape
90
  batch_type: numel
91
  valid_batch_type: null
92
  fold_length:
@@ -102,9 +102,6 @@ train_data_path_and_name_and_type:
102
  - - dump/raw/tr_no_dev/text
103
  - text
104
  - text
105
- - - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best//tr_no_dev/durations
106
- - durations
107
- - text_int
108
  - - dump/raw/tr_no_dev/wav.scp
109
  - speech
110
  - sound
@@ -112,9 +109,6 @@ valid_data_path_and_name_and_type:
112
  - - dump/raw/dev/text
113
  - text
114
  - text
115
- - - exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best//dev/durations
116
- - durations
117
- - text_int
118
  - - dump/raw/dev/wav.scp
119
  - speech
120
  - sound
@@ -124,11 +118,11 @@ max_cache_fd: 32
124
  valid_max_cache_size: null
125
  optim: adam
126
  optim_conf:
127
- lr: 1.0
128
- scheduler: noamlr
129
- scheduler_conf:
130
- model_size: 384
131
- warmup_steps: 4000
132
  token_list:
133
  - <blank>
134
  - <unk>
@@ -226,37 +220,40 @@ feats_extract_conf:
226
  win_length: null
227
  normalize: global_mvn
228
  normalize_conf:
229
- stats_file: exp/tts_train_raw_phn_tacotron_g2p_en_no_space/decode_use_teacher_forcingtrue_train.loss.best/stats/train/feats_stats.npz
230
- tts: fastspeech
231
  tts_conf:
232
- adim: 384
233
- aheads: 2
234
- elayers: 6
235
- eunits: 1536
236
- dlayers: 6
237
- dunits: 1536
238
- positionwise_layer_type: conv1d
239
- positionwise_conv_kernel_size: 3
240
- duration_predictor_layers: 2
241
- duration_predictor_chans: 384
242
- duration_predictor_kernel_size: 3
 
 
 
 
243
  postnet_layers: 5
 
244
  postnet_filts: 5
245
- postnet_chans: 256
246
- use_masking: true
247
- use_scaled_pos_enc: true
248
- encoder_normalize_before: true
249
- decoder_normalize_before: true
 
250
  reduction_factor: 1
251
- init_type: xavier_uniform
252
- init_enc_alpha: 1.0
253
- init_dec_alpha: 1.0
254
- transformer_enc_dropout_rate: 0.1
255
- transformer_enc_positional_dropout_rate: 0.1
256
- transformer_enc_attn_dropout_rate: 0.1
257
- transformer_dec_dropout_rate: 0.1
258
- transformer_dec_positional_dropout_rate: 0.1
259
- transformer_dec_attn_dropout_rate: 0.1
260
  pitch_extract: null
261
  pitch_extract_conf: {}
262
  pitch_normalize: null
@@ -269,6 +266,7 @@ required:
269
  - output_dir
270
  - token_list
271
  distributed: false
 
272
  ```
273
 
274
  </details>
 
12
  ---
13
  ## RyanSpeech model (based on ESPnet2)
14
 
15
+ ### `espnet/english_male_ryanspeech_tacotron`
16
  This model was trained by Rohola Zandie using the ryanspeech recipe in [espnet](https://github.com/espnet/espnet/).
17
 
18
 
 
24
  <details><summary>expand</summary>
25
 
26
  ```
27
+ config: conf/train.yaml
28
  print_config: false
29
  log_level: INFO
30
  dry_run: false
31
  iterator_type: sequence
32
+ output_dir: exp/tts_train_raw_phn_tacotron_g2p_en_no_space
33
  ngpu: 1
34
  seed: 0
35
  num_workers: 1
 
48
  cudnn_deterministic: true
49
  collect_stats: false
50
  write_collected_feats: false
51
+ max_epoch: 200
52
  patience: null
53
  val_scheduler_criterion:
54
  - valid
 
68
  grad_clip: 1.0
69
  grad_clip_type: 2.0
70
  grad_noise: false
71
+ accum_grad: 1
72
  no_forward_run: false
73
  resume: true
74
  train_dtype: float32
 
79
  num_iters_per_epoch: 500
80
  batch_size: 20
81
  valid_batch_size: null
82
+ batch_bins: 5120000
83
  valid_batch_bins: null
84
  train_shape_file:
85
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
86
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/speech_shape
87
  valid_shape_file:
88
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
89
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/speech_shape
90
  batch_type: numel
91
  valid_batch_type: null
92
  fold_length:
 
102
  - - dump/raw/tr_no_dev/text
103
  - text
104
  - text
 
 
 
105
  - - dump/raw/tr_no_dev/wav.scp
106
  - speech
107
  - sound
 
109
  - - dump/raw/dev/text
110
  - text
111
  - text
 
 
 
112
  - - dump/raw/dev/wav.scp
113
  - speech
114
  - sound
 
118
  valid_max_cache_size: null
119
  optim: adam
120
  optim_conf:
121
+ lr: 0.001
122
+ eps: 1.0e-06
123
+ weight_decay: 0.0
124
+ scheduler: null
125
+ scheduler_conf: {}
126
  token_list:
127
  - <blank>
128
  - <unk>
 
220
  win_length: null
221
  normalize: global_mvn
222
  normalize_conf:
223
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
224
+ tts: tacotron2
225
  tts_conf:
226
+ embed_dim: 512
227
+ elayers: 1
228
+ eunits: 512
229
+ econv_layers: 3
230
+ econv_chans: 512
231
+ econv_filts: 5
232
+ atype: location
233
+ adim: 512
234
+ aconv_chans: 32
235
+ aconv_filts: 15
236
+ cumulate_att_w: true
237
+ dlayers: 2
238
+ dunits: 1024
239
+ prenet_layers: 2
240
+ prenet_units: 256
241
  postnet_layers: 5
242
+ postnet_chans: 512
243
  postnet_filts: 5
244
+ output_activation: null
245
+ use_batch_norm: true
246
+ use_concate: true
247
+ use_residual: false
248
+ dropout_rate: 0.5
249
+ zoneout_rate: 0.1
250
  reduction_factor: 1
251
+ spk_embed_dim: null
252
+ use_masking: true
253
+ bce_pos_weight: 5.0
254
+ use_guided_attn_loss: true
255
+ guided_attn_loss_sigma: 0.4
256
+ guided_attn_loss_lambda: 1.0
 
 
 
257
  pitch_extract: null
258
  pitch_extract_conf: {}
259
  pitch_normalize: null
 
266
  - output_dir
267
  - token_list
268
  distributed: false
269
+
270
  ```
271
 
272
  </details>