Text-to-Speech
ESPnet
English
audio
GunnarThor committed on
Commit
941849a
1 Parent(s): fde1cc1

Update model

Browse files
README.md ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: is
7
+ datasets:
8
+ - talromur
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `espnet/GunnarThor_talromur_h_tacotron2`
15
+
16
+ This model was trained by Gunnar Thor using the talromur recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+ git checkout 49a284e69308d81c142b89795de255b4ce290c54
23
+ pip install -e .
24
+ cd egs2/talromur/tts1
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/GunnarThor_talromur_h_tacotron2
26
+ ```
27
+
28
+
29
+
30
+ ## TTS config
31
+
32
+ <details><summary>expand</summary>
33
+
34
+ ```
35
+ config: ./conf/tuning/train_tacotron2.yaml
36
+ print_config: false
37
+ log_level: INFO
38
+ dry_run: false
39
+ iterator_type: sequence
40
+ output_dir: exp/h/tts_train_tacotron2_raw_phn_none
41
+ ngpu: 1
42
+ seed: 0
43
+ num_workers: 1
44
+ num_att_plot: 3
45
+ dist_backend: nccl
46
+ dist_init_method: env://
47
+ dist_world_size: 2
48
+ dist_rank: 0
49
+ local_rank: 0
50
+ dist_master_addr: localhost
51
+ dist_master_port: 54941
52
+ dist_launcher: null
53
+ multiprocessing_distributed: true
54
+ unused_parameters: false
55
+ sharded_ddp: false
56
+ cudnn_enabled: true
57
+ cudnn_benchmark: false
58
+ cudnn_deterministic: true
59
+ collect_stats: false
60
+ write_collected_feats: false
61
+ max_epoch: 100
62
+ patience: null
63
+ val_scheduler_criterion:
64
+ - valid
65
+ - loss
66
+ early_stopping_criterion:
67
+ - valid
68
+ - loss
69
+ - min
70
+ best_model_criterion:
71
+ - - valid
72
+ - loss
73
+ - min
74
+ - - train
75
+ - loss
76
+ - min
77
+ keep_nbest_models: 5
78
+ nbest_averaging_interval: 0
79
+ grad_clip: 1.0
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 1
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: null
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ use_wandb: false
91
+ wandb_project: null
92
+ wandb_id: null
93
+ wandb_entity: null
94
+ wandb_name: null
95
+ wandb_model_log_interval: -1
96
+ detect_anomaly: false
97
+ pretrain_path: null
98
+ init_param: []
99
+ ignore_init_mismatch: false
100
+ freeze_param: []
101
+ num_iters_per_epoch: 500
102
+ batch_size: 20
103
+ valid_batch_size: null
104
+ batch_bins: 2560000
105
+ valid_batch_bins: null
106
+ train_shape_file:
107
+ - exp/h/tts_stats_raw_phn_none/train/text_shape.phn
108
+ - exp/h/tts_stats_raw_phn_none/train/speech_shape
109
+ valid_shape_file:
110
+ - exp/h/tts_stats_raw_phn_none/valid/text_shape.phn
111
+ - exp/h/tts_stats_raw_phn_none/valid/speech_shape
112
+ batch_type: numel
113
+ valid_batch_type: null
114
+ fold_length:
115
+ - 150
116
+ - 204800
117
+ sort_in_batch: descending
118
+ sort_batch: descending
119
+ multiple_iterator: false
120
+ chunk_length: 500
121
+ chunk_shift_ratio: 0.5
122
+ num_cache_chunks: 1024
123
+ train_data_path_and_name_and_type:
124
+ - - dump/raw/train_h_phn/text
125
+ - text
126
+ - text
127
+ - - dump/raw/train_h_phn/wav.scp
128
+ - speech
129
+ - sound
130
+ valid_data_path_and_name_and_type:
131
+ - - dump/raw/dev_h_phn/text
132
+ - text
133
+ - text
134
+ - - dump/raw/dev_h_phn/wav.scp
135
+ - speech
136
+ - sound
137
+ allow_variable_data_keys: false
138
+ max_cache_size: 0.0
139
+ max_cache_fd: 32
140
+ valid_max_cache_size: null
141
+ optim: adam
142
+ optim_conf:
143
+ lr: 0.001
144
+ eps: 1.0e-06
145
+ weight_decay: 0.0
146
+ scheduler: null
147
+ scheduler_conf: {}
148
+ token_list:
149
+ - <blank>
150
+ - <unk>
151
+ - ','
152
+ - .
153
+ - r
154
+ - t
155
+ - n
156
+ - a0
157
+ - s
158
+ - I0
159
+ - D
160
+ - l
161
+ - Y0
162
+ - m
163
+ - v
164
+ - h
165
+ - E1
166
+ - k
167
+ - a:1
168
+ - E:1
169
+ - f
170
+ - G
171
+ - j
172
+ - T
173
+ - a1
174
+ - p
175
+ - c
176
+ - au:1
177
+ - i:1
178
+ - O:1
179
+ - I:1
180
+ - E0
181
+ - I1
182
+ - r_0
183
+ - t_h
184
+ - k_h
185
+ - Y1
186
+ - ei1
187
+ - i0
188
+ - ou:1
189
+ - ei:1
190
+ - u:1
191
+ - O1
192
+ - N
193
+ - l_0
194
+ - '91'
195
+ - ai0
196
+ - au1
197
+ - ou0
198
+ - n_0
199
+ - ei0
200
+ - O0
201
+ - ou1
202
+ - ai:1
203
+ - '9:1'
204
+ - ai1
205
+ - i1
206
+ - '90'
207
+ - au0
208
+ - c_h
209
+ - x
210
+ - 9i:1
211
+ - C
212
+ - p_h
213
+ - u0
214
+ - Y:1
215
+ - J
216
+ - 9i1
217
+ - u1
218
+ - 9i0
219
+ - N_0
220
+ - m_0
221
+ - J_0
222
+ - Oi1
223
+ - Yi0
224
+ - Yi1
225
+ - Oi0
226
+ - au:0
227
+ - '9:0'
228
+ - E:0
229
+ - <sos/eos>
230
+ odim: null
231
+ model_conf: {}
232
+ use_preprocessor: true
233
+ token_type: phn
234
+ bpemodel: null
235
+ non_linguistic_symbols: null
236
+ cleaner: null
237
+ g2p: null
238
+ feats_extract: fbank
239
+ feats_extract_conf:
240
+ n_fft: 1024
241
+ hop_length: 256
242
+ win_length: null
243
+ fs: 22050
244
+ fmin: 80
245
+ fmax: 7600
246
+ n_mels: 80
247
+ normalize: global_mvn
248
+ normalize_conf:
249
+ stats_file: exp/h/tts_stats_raw_phn_none/train/feats_stats.npz
250
+ tts: tacotron2
251
+ tts_conf:
252
+ embed_dim: 512
253
+ elayers: 1
254
+ eunits: 512
255
+ econv_layers: 3
256
+ econv_chans: 512
257
+ econv_filts: 5
258
+ atype: location
259
+ adim: 512
260
+ aconv_chans: 32
261
+ aconv_filts: 15
262
+ cumulate_att_w: true
263
+ dlayers: 2
264
+ dunits: 1024
265
+ prenet_layers: 2
266
+ prenet_units: 256
267
+ postnet_layers: 5
268
+ postnet_chans: 512
269
+ postnet_filts: 5
270
+ output_activation: null
271
+ use_batch_norm: true
272
+ use_concate: true
273
+ use_residual: false
274
+ dropout_rate: 0.5
275
+ zoneout_rate: 0.1
276
+ reduction_factor: 1
277
+ spk_embed_dim: null
278
+ use_masking: true
279
+ bce_pos_weight: 5.0
280
+ use_guided_attn_loss: true
281
+ guided_attn_loss_sigma: 0.4
282
+ guided_attn_loss_lambda: 1.0
283
+ pitch_extract: null
284
+ pitch_extract_conf: {}
285
+ pitch_normalize: null
286
+ pitch_normalize_conf: {}
287
+ energy_extract: null
288
+ energy_extract_conf: {}
289
+ energy_normalize: null
290
+ energy_normalize_conf: {}
291
+ required:
292
+ - output_dir
293
+ - token_list
294
+ version: 0.10.7a1
295
+ distributed: true
296
+ ```
297
+
298
+ </details>
299
+
300
+
301
+
302
+ ### Citing ESPnet
303
+
304
+ ```bibtex
305
+ @inproceedings{watanabe2018espnet,
306
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
307
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
308
+ year={2018},
309
+ booktitle={Proceedings of Interspeech},
310
+ pages={2207--2211},
311
+ doi={10.21437/Interspeech.2018-1456},
312
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
313
+ }
314
+
315
+
316
+
317
+
318
+ @inproceedings{hayashi2020espnet,
319
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
320
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
321
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
322
+ pages={7654--7658},
323
+ year={2020},
324
+ organization={IEEE}
325
+ }
326
+ ```
327
+
328
+ or arXiv:
329
+
330
+ ```bibtex
331
+ @misc{watanabe2018espnet,
332
+ title={ESPnet: End-to-End Speech Processing Toolkit},
333
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
334
+ year={2018},
335
+ eprint={1804.00015},
336
+ archivePrefix={arXiv},
337
+ primaryClass={cs.CL}
338
+ }
339
+ ```
exp/h/tts_stats_raw_phn_none/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/h/tts_train_tacotron2_raw_phn_none/config.yaml ADDED
@@ -0,0 +1,261 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/train_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/h/tts_train_tacotron2_raw_phn_none
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 2
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 54941
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 1
49
+ no_forward_run: false
50
+ resume: true
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ pretrain_path: null
64
+ init_param: []
65
+ ignore_init_mismatch: false
66
+ freeze_param: []
67
+ num_iters_per_epoch: 500
68
+ batch_size: 20
69
+ valid_batch_size: null
70
+ batch_bins: 2560000
71
+ valid_batch_bins: null
72
+ train_shape_file:
73
+ - exp/h/tts_stats_raw_phn_none/train/text_shape.phn
74
+ - exp/h/tts_stats_raw_phn_none/train/speech_shape
75
+ valid_shape_file:
76
+ - exp/h/tts_stats_raw_phn_none/valid/text_shape.phn
77
+ - exp/h/tts_stats_raw_phn_none/valid/speech_shape
78
+ batch_type: numel
79
+ valid_batch_type: null
80
+ fold_length:
81
+ - 150
82
+ - 204800
83
+ sort_in_batch: descending
84
+ sort_batch: descending
85
+ multiple_iterator: false
86
+ chunk_length: 500
87
+ chunk_shift_ratio: 0.5
88
+ num_cache_chunks: 1024
89
+ train_data_path_and_name_and_type:
90
+ - - dump/raw/train_h_phn/text
91
+ - text
92
+ - text
93
+ - - dump/raw/train_h_phn/wav.scp
94
+ - speech
95
+ - sound
96
+ valid_data_path_and_name_and_type:
97
+ - - dump/raw/dev_h_phn/text
98
+ - text
99
+ - text
100
+ - - dump/raw/dev_h_phn/wav.scp
101
+ - speech
102
+ - sound
103
+ allow_variable_data_keys: false
104
+ max_cache_size: 0.0
105
+ max_cache_fd: 32
106
+ valid_max_cache_size: null
107
+ optim: adam
108
+ optim_conf:
109
+ lr: 0.001
110
+ eps: 1.0e-06
111
+ weight_decay: 0.0
112
+ scheduler: null
113
+ scheduler_conf: {}
114
+ token_list:
115
+ - <blank>
116
+ - <unk>
117
+ - ','
118
+ - .
119
+ - r
120
+ - t
121
+ - n
122
+ - a0
123
+ - s
124
+ - I0
125
+ - D
126
+ - l
127
+ - Y0
128
+ - m
129
+ - v
130
+ - h
131
+ - E1
132
+ - k
133
+ - a:1
134
+ - E:1
135
+ - f
136
+ - G
137
+ - j
138
+ - T
139
+ - a1
140
+ - p
141
+ - c
142
+ - au:1
143
+ - i:1
144
+ - O:1
145
+ - I:1
146
+ - E0
147
+ - I1
148
+ - r_0
149
+ - t_h
150
+ - k_h
151
+ - Y1
152
+ - ei1
153
+ - i0
154
+ - ou:1
155
+ - ei:1
156
+ - u:1
157
+ - O1
158
+ - N
159
+ - l_0
160
+ - '91'
161
+ - ai0
162
+ - au1
163
+ - ou0
164
+ - n_0
165
+ - ei0
166
+ - O0
167
+ - ou1
168
+ - ai:1
169
+ - '9:1'
170
+ - ai1
171
+ - i1
172
+ - '90'
173
+ - au0
174
+ - c_h
175
+ - x
176
+ - 9i:1
177
+ - C
178
+ - p_h
179
+ - u0
180
+ - Y:1
181
+ - J
182
+ - 9i1
183
+ - u1
184
+ - 9i0
185
+ - N_0
186
+ - m_0
187
+ - J_0
188
+ - Oi1
189
+ - Yi0
190
+ - Yi1
191
+ - Oi0
192
+ - au:0
193
+ - '9:0'
194
+ - E:0
195
+ - <sos/eos>
196
+ odim: null
197
+ model_conf: {}
198
+ use_preprocessor: true
199
+ token_type: phn
200
+ bpemodel: null
201
+ non_linguistic_symbols: null
202
+ cleaner: null
203
+ g2p: null
204
+ feats_extract: fbank
205
+ feats_extract_conf:
206
+ n_fft: 1024
207
+ hop_length: 256
208
+ win_length: null
209
+ fs: 22050
210
+ fmin: 80
211
+ fmax: 7600
212
+ n_mels: 80
213
+ normalize: global_mvn
214
+ normalize_conf:
215
+ stats_file: exp/h/tts_stats_raw_phn_none/train/feats_stats.npz
216
+ tts: tacotron2
217
+ tts_conf:
218
+ embed_dim: 512
219
+ elayers: 1
220
+ eunits: 512
221
+ econv_layers: 3
222
+ econv_chans: 512
223
+ econv_filts: 5
224
+ atype: location
225
+ adim: 512
226
+ aconv_chans: 32
227
+ aconv_filts: 15
228
+ cumulate_att_w: true
229
+ dlayers: 2
230
+ dunits: 1024
231
+ prenet_layers: 2
232
+ prenet_units: 256
233
+ postnet_layers: 5
234
+ postnet_chans: 512
235
+ postnet_filts: 5
236
+ output_activation: null
237
+ use_batch_norm: true
238
+ use_concate: true
239
+ use_residual: false
240
+ dropout_rate: 0.5
241
+ zoneout_rate: 0.1
242
+ reduction_factor: 1
243
+ spk_embed_dim: null
244
+ use_masking: true
245
+ bce_pos_weight: 5.0
246
+ use_guided_attn_loss: true
247
+ guided_attn_loss_sigma: 0.4
248
+ guided_attn_loss_lambda: 1.0
249
+ pitch_extract: null
250
+ pitch_extract_conf: {}
251
+ pitch_normalize: null
252
+ pitch_normalize_conf: {}
253
+ energy_extract: null
254
+ energy_extract_conf: {}
255
+ energy_normalize: null
256
+ energy_normalize_conf: {}
257
+ required:
258
+ - output_dir
259
+ - token_list
260
+ version: 0.10.7a1
261
+ distributed: true
exp/h/tts_train_tacotron2_raw_phn_none/images/attn_loss.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/images/backward_time.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/images/bce_loss.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/images/forward_time.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/images/gpu_max_cached_mem_GB.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/images/iter_time.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/images/l1_loss.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/images/loss.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/images/mse_loss.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/images/optim0_lr0.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/images/optim_step_time.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/images/train_time.png ADDED
exp/h/tts_train_tacotron2_raw_phn_none/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59893dd09f702e12031ecfa044e04688bad776d23d2160a8961e6471cb66db77
3
+ size 106863980
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.7a1
2
+ files:
3
+ model_file: exp/h/tts_train_tacotron2_raw_phn_none/train.loss.ave_5best.pth
4
+ python: "3.8.12 (default, Oct 12 2021, 13:49:34) \n[GCC 7.5.0]"
5
+ timestamp: 1650469076.076944
6
+ torch: 1.10.2+cu102
7
+ yaml_files:
8
+ train_config: exp/h/tts_train_tacotron2_raw_phn_none/config.yaml