Takaaki-Saeki committed on
Commit
cdc16ba
1 Parent(s): 6108aa3

Update model

Browse files
README.md ADDED
@@ -0,0 +1,340 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - m_ailabs
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `saefro991/m-ailabs_en-us_judy_phn_tacotron2`
15
+
16
+ This model was trained by Takaaki-Saeki using the m_ailabs recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ Follow the [ESPnet installation instructions](https://espnet.github.io/espnet/installation.html)
21
+ if you haven't done that already.
22
+
23
+ ```bash
24
+ cd espnet
25
+ git checkout 32b0f75b4491b71e88deac62bfc431cfcc9d7143
26
+ pip install -e .
27
+ cd egs2/m_ailabs/tts1
28
+ ./run.sh --skip_data_prep false --skip_train true --download_model saefro991/m-ailabs_en-us_judy_phn_tacotron2
29
+ ```
30
+
31
+
32
+
33
+ ## TTS config
34
+
35
+ <details><summary>expand</summary>
36
+
37
+ ```
38
+ config: conf/train.yaml
39
+ print_config: false
40
+ log_level: INFO
41
+ dry_run: false
42
+ iterator_type: sequence
43
+ output_dir: exp/tts_train_raw_phn_tacotron_g2p_en
44
+ ngpu: 1
45
+ seed: 0
46
+ num_workers: 1
47
+ num_att_plot: 3
48
+ dist_backend: nccl
49
+ dist_init_method: env://
50
+ dist_world_size: null
51
+ dist_rank: null
52
+ local_rank: 0
53
+ dist_master_addr: null
54
+ dist_master_port: null
55
+ dist_launcher: null
56
+ multiprocessing_distributed: false
57
+ unused_parameters: false
58
+ sharded_ddp: false
59
+ cudnn_enabled: true
60
+ cudnn_benchmark: false
61
+ cudnn_deterministic: true
62
+ collect_stats: false
63
+ write_collected_feats: false
64
+ max_epoch: 200
65
+ patience: null
66
+ val_scheduler_criterion:
67
+ - valid
68
+ - loss
69
+ early_stopping_criterion:
70
+ - valid
71
+ - loss
72
+ - min
73
+ best_model_criterion:
74
+ - - valid
75
+ - loss
76
+ - min
77
+ - - train
78
+ - loss
79
+ - min
80
+ keep_nbest_models: 5
81
+ nbest_averaging_interval: 0
82
+ grad_clip: 1.0
83
+ grad_clip_type: 2.0
84
+ grad_noise: false
85
+ accum_grad: 1
86
+ no_forward_run: false
87
+ resume: true
88
+ train_dtype: float32
89
+ use_amp: false
90
+ log_interval: null
91
+ use_matplotlib: true
92
+ use_tensorboard: true
93
+ create_graph_in_tensorboard: false
94
+ use_wandb: false
95
+ wandb_project: null
96
+ wandb_id: null
97
+ wandb_entity: null
98
+ wandb_name: null
99
+ wandb_model_log_interval: -1
100
+ detect_anomaly: false
101
+ pretrain_path: null
102
+ init_param: []
103
+ ignore_init_mismatch: false
104
+ freeze_param: []
105
+ num_iters_per_epoch: 500
106
+ batch_size: 20
107
+ valid_batch_size: null
108
+ batch_bins: 5120000
109
+ valid_batch_bins: null
110
+ train_shape_file:
111
+ - exp/tts_stats_raw_phn_tacotron_g2p_en/train/text_shape.phn
112
+ - exp/tts_stats_raw_phn_tacotron_g2p_en/train/speech_shape
113
+ valid_shape_file:
114
+ - exp/tts_stats_raw_phn_tacotron_g2p_en/valid/text_shape.phn
115
+ - exp/tts_stats_raw_phn_tacotron_g2p_en/valid/speech_shape
116
+ batch_type: numel
117
+ valid_batch_type: null
118
+ fold_length:
119
+ - 150
120
+ - 204800
121
+ sort_in_batch: descending
122
+ sort_batch: descending
123
+ multiple_iterator: false
124
+ chunk_length: 500
125
+ chunk_shift_ratio: 0.5
126
+ num_cache_chunks: 1024
127
+ train_data_path_and_name_and_type:
128
+ - - dump/raw/tr_no_dev/text
129
+ - text
130
+ - text
131
+ - - dump/raw/tr_no_dev/wav.scp
132
+ - speech
133
+ - sound
134
+ valid_data_path_and_name_and_type:
135
+ - - dump/raw/dev/text
136
+ - text
137
+ - text
138
+ - - dump/raw/dev/wav.scp
139
+ - speech
140
+ - sound
141
+ allow_variable_data_keys: false
142
+ max_cache_size: 0.0
143
+ max_cache_fd: 32
144
+ valid_max_cache_size: null
145
+ optim: adam
146
+ optim_conf:
147
+ lr: 0.001
148
+ eps: 1.0e-06
149
+ weight_decay: 0.0
150
+ scheduler: null
151
+ scheduler_conf: {}
152
+ token_list:
153
+ - <blank>
154
+ - <unk>
155
+ - ''
156
+ - AH0
157
+ - T
158
+ - N
159
+ - D
160
+ - R
161
+ - S
162
+ - L
163
+ - DH
164
+ - IH1
165
+ - K
166
+ - EH1
167
+ - M
168
+ - Z
169
+ - AE1
170
+ - W
171
+ - IH0
172
+ - AH1
173
+ - ','
174
+ - B
175
+ - IY1
176
+ - ER0
177
+ - UW1
178
+ - P
179
+ - HH
180
+ - AY1
181
+ - F
182
+ - V
183
+ - AA1
184
+ - AO1
185
+ - .
186
+ - EY1
187
+ - IY0
188
+ - OW1
189
+ - NG
190
+ - G
191
+ - Y
192
+ - AW1
193
+ - SH
194
+ - CH
195
+ - ER1
196
+ - UH1
197
+ - TH
198
+ - JH
199
+ - OW0
200
+ - OY1
201
+ - '?'
202
+ - '!'
203
+ - EH0
204
+ - EY2
205
+ - IH2
206
+ - ''''
207
+ - AY2
208
+ - AA0
209
+ - EH2
210
+ - UW0
211
+ - AA2
212
+ - AH2
213
+ - AE0
214
+ - OW2
215
+ - AO2
216
+ - UW2
217
+ - AE2
218
+ - ZH
219
+ - AW2
220
+ - AY0
221
+ - IY2
222
+ - AO0
223
+ - UH0
224
+ - UH2
225
+ - OY2
226
+ - AW0
227
+ - ER2
228
+ - EY0
229
+ - OY0
230
+ - <sos/eos>
231
+ odim: null
232
+ model_conf: {}
233
+ use_preprocessor: true
234
+ token_type: phn
235
+ bpemodel: null
236
+ non_linguistic_symbols: null
237
+ cleaner: tacotron
238
+ g2p: g2p_en
239
+ feats_extract: fbank
240
+ feats_extract_conf:
241
+ n_fft: 1024
242
+ hop_length: 256
243
+ win_length: null
244
+ fs: 16000
245
+ fmin: 80
246
+ fmax: 7600
247
+ n_mels: 80
248
+ normalize: global_mvn
249
+ normalize_conf:
250
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en/train/feats_stats.npz
251
+ tts: tacotron2
252
+ tts_conf:
253
+ embed_dim: 512
254
+ elayers: 1
255
+ eunits: 512
256
+ econv_layers: 3
257
+ econv_chans: 512
258
+ econv_filts: 5
259
+ atype: location
260
+ adim: 512
261
+ aconv_chans: 32
262
+ aconv_filts: 15
263
+ cumulate_att_w: true
264
+ dlayers: 2
265
+ dunits: 1024
266
+ prenet_layers: 2
267
+ prenet_units: 256
268
+ postnet_layers: 5
269
+ postnet_chans: 512
270
+ postnet_filts: 5
271
+ output_activation: null
272
+ use_batch_norm: true
273
+ use_concate: true
274
+ use_residual: false
275
+ dropout_rate: 0.5
276
+ zoneout_rate: 0.1
277
+ reduction_factor: 1
278
+ spk_embed_dim: null
279
+ use_masking: true
280
+ bce_pos_weight: 5.0
281
+ use_guided_attn_loss: true
282
+ guided_attn_loss_sigma: 0.4
283
+ guided_attn_loss_lambda: 1.0
284
+ pitch_extract: null
285
+ pitch_extract_conf: {}
286
+ pitch_normalize: null
287
+ pitch_normalize_conf: {}
288
+ energy_extract: null
289
+ energy_extract_conf: {}
290
+ energy_normalize: null
291
+ energy_normalize_conf: {}
292
+ required:
293
+ - output_dir
294
+ - token_list
295
+ version: '202209'
296
+ distributed: false
297
+ ```
298
+
299
+ </details>
300
+
301
+
302
+
303
+ ### Citing ESPnet
304
+
305
+ ```bibtex
306
+ @inproceedings{watanabe2018espnet,
307
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
308
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
309
+ year={2018},
310
+ booktitle={Proceedings of Interspeech},
311
+ pages={2207--2211},
312
+ doi={10.21437/Interspeech.2018-1456},
313
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
314
+ }
315
+
316
+
317
+
318
+
319
+ @inproceedings{hayashi2020espnet,
320
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
321
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
322
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
323
+ pages={7654--7658},
324
+ year={2020},
325
+ organization={IEEE}
326
+ }
327
+ ```
328
+
329
+ or arXiv:
330
+
331
+ ```bibtex
332
+ @misc{watanabe2018espnet,
333
+ title={ESPnet: End-to-End Speech Processing Toolkit},
334
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
335
+ year={2018},
336
+ eprint={1804.00015},
337
+ archivePrefix={arXiv},
338
+ primaryClass={cs.CL}
339
+ }
340
+ ```
exp/tts_stats_raw_phn_tacotron_g2p_en/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15200210e6b236ce3060d6e36a277bd8d223bca70bc06a52aa58da0ec51a7c9e
3
+ size 1402
exp/tts_train_raw_phn_tacotron_g2p_en/59epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2dcff3c905a992710847220b084a17ec50ef25b1b87b0da038b27a22cfa615a9
3
+ size 106857729
exp/tts_train_raw_phn_tacotron_g2p_en/config.yaml ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_raw_phn_tacotron_g2p_en
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 200
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 1
49
+ no_forward_run: false
50
+ resume: true
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ create_graph_in_tensorboard: false
57
+ use_wandb: false
58
+ wandb_project: null
59
+ wandb_id: null
60
+ wandb_entity: null
61
+ wandb_name: null
62
+ wandb_model_log_interval: -1
63
+ detect_anomaly: false
64
+ pretrain_path: null
65
+ init_param: []
66
+ ignore_init_mismatch: false
67
+ freeze_param: []
68
+ num_iters_per_epoch: 500
69
+ batch_size: 20
70
+ valid_batch_size: null
71
+ batch_bins: 5120000
72
+ valid_batch_bins: null
73
+ train_shape_file:
74
+ - exp/tts_stats_raw_phn_tacotron_g2p_en/train/text_shape.phn
75
+ - exp/tts_stats_raw_phn_tacotron_g2p_en/train/speech_shape
76
+ valid_shape_file:
77
+ - exp/tts_stats_raw_phn_tacotron_g2p_en/valid/text_shape.phn
78
+ - exp/tts_stats_raw_phn_tacotron_g2p_en/valid/speech_shape
79
+ batch_type: numel
80
+ valid_batch_type: null
81
+ fold_length:
82
+ - 150
83
+ - 204800
84
+ sort_in_batch: descending
85
+ sort_batch: descending
86
+ multiple_iterator: false
87
+ chunk_length: 500
88
+ chunk_shift_ratio: 0.5
89
+ num_cache_chunks: 1024
90
+ train_data_path_and_name_and_type:
91
+ - - dump/raw/tr_no_dev/text
92
+ - text
93
+ - text
94
+ - - dump/raw/tr_no_dev/wav.scp
95
+ - speech
96
+ - sound
97
+ valid_data_path_and_name_and_type:
98
+ - - dump/raw/dev/text
99
+ - text
100
+ - text
101
+ - - dump/raw/dev/wav.scp
102
+ - speech
103
+ - sound
104
+ allow_variable_data_keys: false
105
+ max_cache_size: 0.0
106
+ max_cache_fd: 32
107
+ valid_max_cache_size: null
108
+ optim: adam
109
+ optim_conf:
110
+ lr: 0.001
111
+ eps: 1.0e-06
112
+ weight_decay: 0.0
113
+ scheduler: null
114
+ scheduler_conf: {}
115
+ token_list:
116
+ - <blank>
117
+ - <unk>
118
+ - ''
119
+ - AH0
120
+ - T
121
+ - N
122
+ - D
123
+ - R
124
+ - S
125
+ - L
126
+ - DH
127
+ - IH1
128
+ - K
129
+ - EH1
130
+ - M
131
+ - Z
132
+ - AE1
133
+ - W
134
+ - IH0
135
+ - AH1
136
+ - ','
137
+ - B
138
+ - IY1
139
+ - ER0
140
+ - UW1
141
+ - P
142
+ - HH
143
+ - AY1
144
+ - F
145
+ - V
146
+ - AA1
147
+ - AO1
148
+ - .
149
+ - EY1
150
+ - IY0
151
+ - OW1
152
+ - NG
153
+ - G
154
+ - Y
155
+ - AW1
156
+ - SH
157
+ - CH
158
+ - ER1
159
+ - UH1
160
+ - TH
161
+ - JH
162
+ - OW0
163
+ - OY1
164
+ - '?'
165
+ - '!'
166
+ - EH0
167
+ - EY2
168
+ - IH2
169
+ - ''''
170
+ - AY2
171
+ - AA0
172
+ - EH2
173
+ - UW0
174
+ - AA2
175
+ - AH2
176
+ - AE0
177
+ - OW2
178
+ - AO2
179
+ - UW2
180
+ - AE2
181
+ - ZH
182
+ - AW2
183
+ - AY0
184
+ - IY2
185
+ - AO0
186
+ - UH0
187
+ - UH2
188
+ - OY2
189
+ - AW0
190
+ - ER2
191
+ - EY0
192
+ - OY0
193
+ - <sos/eos>
194
+ odim: null
195
+ model_conf: {}
196
+ use_preprocessor: true
197
+ token_type: phn
198
+ bpemodel: null
199
+ non_linguistic_symbols: null
200
+ cleaner: tacotron
201
+ g2p: g2p_en
202
+ feats_extract: fbank
203
+ feats_extract_conf:
204
+ n_fft: 1024
205
+ hop_length: 256
206
+ win_length: null
207
+ fs: 16000
208
+ fmin: 80
209
+ fmax: 7600
210
+ n_mels: 80
211
+ normalize: global_mvn
212
+ normalize_conf:
213
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en/train/feats_stats.npz
214
+ tts: tacotron2
215
+ tts_conf:
216
+ embed_dim: 512
217
+ elayers: 1
218
+ eunits: 512
219
+ econv_layers: 3
220
+ econv_chans: 512
221
+ econv_filts: 5
222
+ atype: location
223
+ adim: 512
224
+ aconv_chans: 32
225
+ aconv_filts: 15
226
+ cumulate_att_w: true
227
+ dlayers: 2
228
+ dunits: 1024
229
+ prenet_layers: 2
230
+ prenet_units: 256
231
+ postnet_layers: 5
232
+ postnet_chans: 512
233
+ postnet_filts: 5
234
+ output_activation: null
235
+ use_batch_norm: true
236
+ use_concate: true
237
+ use_residual: false
238
+ dropout_rate: 0.5
239
+ zoneout_rate: 0.1
240
+ reduction_factor: 1
241
+ spk_embed_dim: null
242
+ use_masking: true
243
+ bce_pos_weight: 5.0
244
+ use_guided_attn_loss: true
245
+ guided_attn_loss_sigma: 0.4
246
+ guided_attn_loss_lambda: 1.0
247
+ pitch_extract: null
248
+ pitch_extract_conf: {}
249
+ pitch_normalize: null
250
+ pitch_normalize_conf: {}
251
+ energy_extract: null
252
+ energy_extract_conf: {}
253
+ energy_normalize: null
254
+ energy_normalize_conf: {}
255
+ required:
256
+ - output_dir
257
+ - token_list
258
+ version: '202209'
259
+ distributed: false
exp/tts_train_raw_phn_tacotron_g2p_en/images/attn_loss.png ADDED
exp/tts_train_raw_phn_tacotron_g2p_en/images/backward_time.png ADDED
exp/tts_train_raw_phn_tacotron_g2p_en/images/bce_loss.png ADDED
exp/tts_train_raw_phn_tacotron_g2p_en/images/forward_time.png ADDED
exp/tts_train_raw_phn_tacotron_g2p_en/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_raw_phn_tacotron_g2p_en/images/iter_time.png ADDED
exp/tts_train_raw_phn_tacotron_g2p_en/images/l1_loss.png ADDED
exp/tts_train_raw_phn_tacotron_g2p_en/images/loss.png ADDED
exp/tts_train_raw_phn_tacotron_g2p_en/images/mse_loss.png ADDED
exp/tts_train_raw_phn_tacotron_g2p_en/images/optim0_lr0.png ADDED
exp/tts_train_raw_phn_tacotron_g2p_en/images/optim_step_time.png ADDED
exp/tts_train_raw_phn_tacotron_g2p_en/images/train_time.png ADDED
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202209'
2
+ files:
3
+ model_file: exp/tts_train_raw_phn_tacotron_g2p_en/59epoch.pth
4
+ python: "3.8.13 (default, Mar 28 2022, 11:38:47) \n[GCC 7.5.0]"
5
+ timestamp: 1665338336.572188
6
+ torch: 1.12.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_raw_phn_tacotron_g2p_en/config.yaml