GunnarThor committed on
Commit
0e9a570
1 Parent(s): d36968a

Update model

Browse files
README.md ADDED
@@ -0,0 +1,338 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: is
7
+ datasets:
8
+ - talromur
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `espnet/GunnarThor_talromur_b_tacotron2`
15
+
16
+ This model was trained by Gunnar Thor using the talromur recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+ git checkout 49a284e69308d81c142b89795de255b4ce290c54
23
+ pip install -e .
24
+ cd egs2/talromur/tts1
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/GunnarThor_talromur_b_tacotron2
26
+ ```
27
+
28
+
29
+
30
+ ## TTS config
31
+
32
+ <details><summary>expand</summary>
33
+
34
+ ```
35
+ config: ./conf/tuning/train_tacotron2.yaml
36
+ print_config: false
37
+ log_level: INFO
38
+ dry_run: false
39
+ iterator_type: sequence
40
+ output_dir: exp/b/tts_train_tacotron2_raw_phn_none
41
+ ngpu: 1
42
+ seed: 0
43
+ num_workers: 1
44
+ num_att_plot: 3
45
+ dist_backend: nccl
46
+ dist_init_method: env://
47
+ dist_world_size: 2
48
+ dist_rank: 0
49
+ local_rank: 0
50
+ dist_master_addr: localhost
51
+ dist_master_port: 55403
52
+ dist_launcher: null
53
+ multiprocessing_distributed: true
54
+ unused_parameters: false
55
+ sharded_ddp: false
56
+ cudnn_enabled: true
57
+ cudnn_benchmark: false
58
+ cudnn_deterministic: true
59
+ collect_stats: false
60
+ write_collected_feats: false
61
+ max_epoch: 100
62
+ patience: null
63
+ val_scheduler_criterion:
64
+ - valid
65
+ - loss
66
+ early_stopping_criterion:
67
+ - valid
68
+ - loss
69
+ - min
70
+ best_model_criterion:
71
+ - - valid
72
+ - loss
73
+ - min
74
+ - - train
75
+ - loss
76
+ - min
77
+ keep_nbest_models: 5
78
+ nbest_averaging_interval: 0
79
+ grad_clip: 1.0
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 1
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: null
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ use_wandb: false
91
+ wandb_project: null
92
+ wandb_id: null
93
+ wandb_entity: null
94
+ wandb_name: null
95
+ wandb_model_log_interval: -1
96
+ detect_anomaly: false
97
+ pretrain_path: null
98
+ init_param: []
99
+ ignore_init_mismatch: false
100
+ freeze_param: []
101
+ num_iters_per_epoch: 500
102
+ batch_size: 20
103
+ valid_batch_size: null
104
+ batch_bins: 2560000
105
+ valid_batch_bins: null
106
+ train_shape_file:
107
+ - exp/b/tts_stats_raw_phn_none/train/text_shape.phn
108
+ - exp/b/tts_stats_raw_phn_none/train/speech_shape
109
+ valid_shape_file:
110
+ - exp/b/tts_stats_raw_phn_none/valid/text_shape.phn
111
+ - exp/b/tts_stats_raw_phn_none/valid/speech_shape
112
+ batch_type: numel
113
+ valid_batch_type: null
114
+ fold_length:
115
+ - 150
116
+ - 204800
117
+ sort_in_batch: descending
118
+ sort_batch: descending
119
+ multiple_iterator: false
120
+ chunk_length: 500
121
+ chunk_shift_ratio: 0.5
122
+ num_cache_chunks: 1024
123
+ train_data_path_and_name_and_type:
124
+ - - dump/raw/train_b_phn/text
125
+ - text
126
+ - text
127
+ - - dump/raw/train_b_phn/wav.scp
128
+ - speech
129
+ - sound
130
+ valid_data_path_and_name_and_type:
131
+ - - dump/raw/dev_b_phn/text
132
+ - text
133
+ - text
134
+ - - dump/raw/dev_b_phn/wav.scp
135
+ - speech
136
+ - sound
137
+ allow_variable_data_keys: false
138
+ max_cache_size: 0.0
139
+ max_cache_fd: 32
140
+ valid_max_cache_size: null
141
+ optim: adam
142
+ optim_conf:
143
+ lr: 0.001
144
+ eps: 1.0e-06
145
+ weight_decay: 0.0
146
+ scheduler: null
147
+ scheduler_conf: {}
148
+ token_list:
149
+ - <blank>
150
+ - <unk>
151
+ - ','
152
+ - .
153
+ - r
154
+ - t
155
+ - n
156
+ - a0
157
+ - s
158
+ - I0
159
+ - D
160
+ - l
161
+ - Y0
162
+ - m
163
+ - v
164
+ - h
165
+ - E1
166
+ - k
167
+ - a:1
168
+ - j
169
+ - E:1
170
+ - f
171
+ - T
172
+ - G
173
+ - a1
174
+ - p
175
+ - c
176
+ - i:1
177
+ - au:1
178
+ - O:1
179
+ - E0
180
+ - I:1
181
+ - r_0
182
+ - t_h
183
+ - I1
184
+ - k_h
185
+ - Y1
186
+ - i0
187
+ - ei1
188
+ - u:1
189
+ - ou:1
190
+ - ei:1
191
+ - O1
192
+ - N
193
+ - l_0
194
+ - '91'
195
+ - n_0
196
+ - ou0
197
+ - ai0
198
+ - au1
199
+ - ou1
200
+ - O0
201
+ - '9:1'
202
+ - ai:1
203
+ - ei0
204
+ - ai1
205
+ - i1
206
+ - au0
207
+ - c_h
208
+ - p_h
209
+ - '90'
210
+ - C
211
+ - x
212
+ - u0
213
+ - 9i:1
214
+ - Y:1
215
+ - u1
216
+ - 9i1
217
+ - J
218
+ - N_0
219
+ - m_0
220
+ - 9i0
221
+ - J_0
222
+ - Oi1
223
+ - Yi0
224
+ - Yi1
225
+ - Oi0
226
+ - au:0
227
+ - '9:0'
228
+ - <sos/eos>
229
+ odim: null
230
+ model_conf: {}
231
+ use_preprocessor: true
232
+ token_type: phn
233
+ bpemodel: null
234
+ non_linguistic_symbols: null
235
+ cleaner: null
236
+ g2p: null
237
+ feats_extract: fbank
238
+ feats_extract_conf:
239
+ n_fft: 1024
240
+ hop_length: 256
241
+ win_length: null
242
+ fs: 22050
243
+ fmin: 80
244
+ fmax: 7600
245
+ n_mels: 80
246
+ normalize: global_mvn
247
+ normalize_conf:
248
+ stats_file: exp/b/tts_stats_raw_phn_none/train/feats_stats.npz
249
+ tts: tacotron2
250
+ tts_conf:
251
+ embed_dim: 512
252
+ elayers: 1
253
+ eunits: 512
254
+ econv_layers: 3
255
+ econv_chans: 512
256
+ econv_filts: 5
257
+ atype: location
258
+ adim: 512
259
+ aconv_chans: 32
260
+ aconv_filts: 15
261
+ cumulate_att_w: true
262
+ dlayers: 2
263
+ dunits: 1024
264
+ prenet_layers: 2
265
+ prenet_units: 256
266
+ postnet_layers: 5
267
+ postnet_chans: 512
268
+ postnet_filts: 5
269
+ output_activation: null
270
+ use_batch_norm: true
271
+ use_concate: true
272
+ use_residual: false
273
+ dropout_rate: 0.5
274
+ zoneout_rate: 0.1
275
+ reduction_factor: 1
276
+ spk_embed_dim: null
277
+ use_masking: true
278
+ bce_pos_weight: 5.0
279
+ use_guided_attn_loss: true
280
+ guided_attn_loss_sigma: 0.4
281
+ guided_attn_loss_lambda: 1.0
282
+ pitch_extract: null
283
+ pitch_extract_conf: {}
284
+ pitch_normalize: null
285
+ pitch_normalize_conf: {}
286
+ energy_extract: null
287
+ energy_extract_conf: {}
288
+ energy_normalize: null
289
+ energy_normalize_conf: {}
290
+ required:
291
+ - output_dir
292
+ - token_list
293
+ version: 0.10.7a1
294
+ distributed: true
295
+ ```
296
+
297
+ </details>
298
+
299
+
300
+
301
+ ### Citing ESPnet
302
+
303
+ ```bibtex
304
+ @inproceedings{watanabe2018espnet,
305
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
306
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
307
+ year={2018},
308
+ booktitle={Proceedings of Interspeech},
309
+ pages={2207--2211},
310
+ doi={10.21437/Interspeech.2018-1456},
311
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
312
+ }
313
+
314
+
315
+
316
+
317
+ @inproceedings{hayashi2020espnet,
318
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
319
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
320
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
321
+ pages={7654--7658},
322
+ year={2020},
323
+ organization={IEEE}
324
+ }
325
+ ```
326
+
327
+ or arXiv:
328
+
329
+ ```bibtex
330
+ @misc{watanabe2018espnet,
331
+ title={ESPnet: End-to-End Speech Processing Toolkit},
332
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
333
+ year={2018},
334
+ eprint={1804.00015},
335
+ archivePrefix={arXiv},
336
+ primaryClass={cs.CL}
337
+ }
338
+ ```
exp/b/tts_stats_raw_phn_none/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/b/tts_train_tacotron2_raw_phn_none/config.yaml ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/train_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/b/tts_train_tacotron2_raw_phn_none
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 2
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 55403
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 1
49
+ no_forward_run: false
50
+ resume: true
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ pretrain_path: null
64
+ init_param: []
65
+ ignore_init_mismatch: false
66
+ freeze_param: []
67
+ num_iters_per_epoch: 500
68
+ batch_size: 20
69
+ valid_batch_size: null
70
+ batch_bins: 2560000
71
+ valid_batch_bins: null
72
+ train_shape_file:
73
+ - exp/b/tts_stats_raw_phn_none/train/text_shape.phn
74
+ - exp/b/tts_stats_raw_phn_none/train/speech_shape
75
+ valid_shape_file:
76
+ - exp/b/tts_stats_raw_phn_none/valid/text_shape.phn
77
+ - exp/b/tts_stats_raw_phn_none/valid/speech_shape
78
+ batch_type: numel
79
+ valid_batch_type: null
80
+ fold_length:
81
+ - 150
82
+ - 204800
83
+ sort_in_batch: descending
84
+ sort_batch: descending
85
+ multiple_iterator: false
86
+ chunk_length: 500
87
+ chunk_shift_ratio: 0.5
88
+ num_cache_chunks: 1024
89
+ train_data_path_and_name_and_type:
90
+ - - dump/raw/train_b_phn/text
91
+ - text
92
+ - text
93
+ - - dump/raw/train_b_phn/wav.scp
94
+ - speech
95
+ - sound
96
+ valid_data_path_and_name_and_type:
97
+ - - dump/raw/dev_b_phn/text
98
+ - text
99
+ - text
100
+ - - dump/raw/dev_b_phn/wav.scp
101
+ - speech
102
+ - sound
103
+ allow_variable_data_keys: false
104
+ max_cache_size: 0.0
105
+ max_cache_fd: 32
106
+ valid_max_cache_size: null
107
+ optim: adam
108
+ optim_conf:
109
+ lr: 0.001
110
+ eps: 1.0e-06
111
+ weight_decay: 0.0
112
+ scheduler: null
113
+ scheduler_conf: {}
114
+ token_list:
115
+ - <blank>
116
+ - <unk>
117
+ - ','
118
+ - .
119
+ - r
120
+ - t
121
+ - n
122
+ - a0
123
+ - s
124
+ - I0
125
+ - D
126
+ - l
127
+ - Y0
128
+ - m
129
+ - v
130
+ - h
131
+ - E1
132
+ - k
133
+ - a:1
134
+ - j
135
+ - E:1
136
+ - f
137
+ - T
138
+ - G
139
+ - a1
140
+ - p
141
+ - c
142
+ - i:1
143
+ - au:1
144
+ - O:1
145
+ - E0
146
+ - I:1
147
+ - r_0
148
+ - t_h
149
+ - I1
150
+ - k_h
151
+ - Y1
152
+ - i0
153
+ - ei1
154
+ - u:1
155
+ - ou:1
156
+ - ei:1
157
+ - O1
158
+ - N
159
+ - l_0
160
+ - '91'
161
+ - n_0
162
+ - ou0
163
+ - ai0
164
+ - au1
165
+ - ou1
166
+ - O0
167
+ - '9:1'
168
+ - ai:1
169
+ - ei0
170
+ - ai1
171
+ - i1
172
+ - au0
173
+ - c_h
174
+ - p_h
175
+ - '90'
176
+ - C
177
+ - x
178
+ - u0
179
+ - 9i:1
180
+ - Y:1
181
+ - u1
182
+ - 9i1
183
+ - J
184
+ - N_0
185
+ - m_0
186
+ - 9i0
187
+ - J_0
188
+ - Oi1
189
+ - Yi0
190
+ - Yi1
191
+ - Oi0
192
+ - au:0
193
+ - '9:0'
194
+ - <sos/eos>
195
+ odim: null
196
+ model_conf: {}
197
+ use_preprocessor: true
198
+ token_type: phn
199
+ bpemodel: null
200
+ non_linguistic_symbols: null
201
+ cleaner: null
202
+ g2p: null
203
+ feats_extract: fbank
204
+ feats_extract_conf:
205
+ n_fft: 1024
206
+ hop_length: 256
207
+ win_length: null
208
+ fs: 22050
209
+ fmin: 80
210
+ fmax: 7600
211
+ n_mels: 80
212
+ normalize: global_mvn
213
+ normalize_conf:
214
+ stats_file: exp/b/tts_stats_raw_phn_none/train/feats_stats.npz
215
+ tts: tacotron2
216
+ tts_conf:
217
+ embed_dim: 512
218
+ elayers: 1
219
+ eunits: 512
220
+ econv_layers: 3
221
+ econv_chans: 512
222
+ econv_filts: 5
223
+ atype: location
224
+ adim: 512
225
+ aconv_chans: 32
226
+ aconv_filts: 15
227
+ cumulate_att_w: true
228
+ dlayers: 2
229
+ dunits: 1024
230
+ prenet_layers: 2
231
+ prenet_units: 256
232
+ postnet_layers: 5
233
+ postnet_chans: 512
234
+ postnet_filts: 5
235
+ output_activation: null
236
+ use_batch_norm: true
237
+ use_concate: true
238
+ use_residual: false
239
+ dropout_rate: 0.5
240
+ zoneout_rate: 0.1
241
+ reduction_factor: 1
242
+ spk_embed_dim: null
243
+ use_masking: true
244
+ bce_pos_weight: 5.0
245
+ use_guided_attn_loss: true
246
+ guided_attn_loss_sigma: 0.4
247
+ guided_attn_loss_lambda: 1.0
248
+ pitch_extract: null
249
+ pitch_extract_conf: {}
250
+ pitch_normalize: null
251
+ pitch_normalize_conf: {}
252
+ energy_extract: null
253
+ energy_extract_conf: {}
254
+ energy_normalize: null
255
+ energy_normalize_conf: {}
256
+ required:
257
+ - output_dir
258
+ - token_list
259
+ version: 0.10.7a1
260
+ distributed: true
exp/b/tts_train_tacotron2_raw_phn_none/images/attn_loss.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/images/backward_time.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/images/bce_loss.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/images/forward_time.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/images/gpu_max_cached_mem_GB.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/images/iter_time.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/images/l1_loss.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/images/loss.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/images/mse_loss.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/images/optim0_lr0.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/images/optim_step_time.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/images/train_time.png ADDED
exp/b/tts_train_tacotron2_raw_phn_none/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f293fe30fc512eaf0f41ca0cf2749f6b9a99b1f2602865732f76bad29dff603
3
+ size 106861932
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.7a1
2
+ files:
3
+ model_file: exp/b/tts_train_tacotron2_raw_phn_none/train.loss.ave_5best.pth
4
+ python: "3.8.12 (default, Oct 12 2021, 13:49:34) \n[GCC 7.5.0]"
5
+ timestamp: 1650384540.219646
6
+ torch: 1.10.2+cu102
7
+ yaml_files:
8
+ train_config: exp/b/tts_train_tacotron2_raw_phn_none/config.yaml