Gunnar Thor committed on
Commit
de36f96
1 Parent(s): dde9958

Add model files

Browse files
README.md ADDED
@@ -0,0 +1,326 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: is
7
+ datasets:
8
+ - talromur2
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### ``
15
+
16
+ This model was trained using a recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+
23
+ pip install -e .
24
+ cd talromur2/tts1/talromur2_xvector_tacotron2
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model <huggingface-model-id>
26
+ ```
27
+
28
+
29
+
30
+ ## TTS config
31
+
32
+ <details><summary>expand</summary>
33
+
34
+ ```
35
+ config: ./conf/tuning/train_xvector_tacotron2.yaml
36
+ print_config: false
37
+ log_level: INFO
38
+ dry_run: false
39
+ iterator_type: sequence
40
+ output_dir: exp/tts_train_xvector_tacotron2_raw_phn_none
41
+ ngpu: 1
42
+ seed: 0
43
+ num_workers: 1
44
+ num_att_plot: 3
45
+ dist_backend: nccl
46
+ dist_init_method: env://
47
+ dist_world_size: null
48
+ dist_rank: null
49
+ local_rank: 0
50
+ dist_master_addr: null
51
+ dist_master_port: null
52
+ dist_launcher: null
53
+ multiprocessing_distributed: false
54
+ unused_parameters: false
55
+ sharded_ddp: false
56
+ cudnn_enabled: true
57
+ cudnn_benchmark: false
58
+ cudnn_deterministic: true
59
+ collect_stats: false
60
+ write_collected_feats: false
61
+ max_epoch: 500
62
+ patience: null
63
+ val_scheduler_criterion:
64
+ - valid
65
+ - loss
66
+ early_stopping_criterion:
67
+ - valid
68
+ - loss
69
+ - min
70
+ best_model_criterion:
71
+ - - valid
72
+ - loss
73
+ - min
74
+ - - train
75
+ - loss
76
+ - min
77
+ keep_nbest_models: 5
78
+ nbest_averaging_interval: 0
79
+ grad_clip: 1.0
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 1
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: null
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ use_wandb: false
91
+ wandb_project: null
92
+ wandb_id: null
93
+ wandb_entity: null
94
+ wandb_name: null
95
+ wandb_model_log_interval: -1
96
+ detect_anomaly: false
97
+ pretrain_path: null
98
+ init_param: []
99
+ ignore_init_mismatch: false
100
+ freeze_param: []
101
+ num_iters_per_epoch: 500
102
+ batch_size: 20
103
+ valid_batch_size: null
104
+ batch_bins: 3750000
105
+ valid_batch_bins: null
106
+ train_shape_file:
107
+ - exp/tts_stats_raw_phn_none/train/text_shape.phn
108
+ - exp/tts_stats_raw_phn_none/train/speech_shape
109
+ valid_shape_file:
110
+ - exp/tts_stats_raw_phn_none/valid/text_shape.phn
111
+ - exp/tts_stats_raw_phn_none/valid/speech_shape
112
+ batch_type: numel
113
+ valid_batch_type: null
114
+ fold_length:
115
+ - 150
116
+ - 204800
117
+ sort_in_batch: descending
118
+ sort_batch: descending
119
+ multiple_iterator: false
120
+ chunk_length: 500
121
+ chunk_shift_ratio: 0.5
122
+ num_cache_chunks: 1024
123
+ train_data_path_and_name_and_type:
124
+ - - dump/raw/train_phn/text
125
+ - text
126
+ - text
127
+ - - dump/raw/train_phn/wav.scp
128
+ - speech
129
+ - sound
130
+ - - dump/xvector/train_phn/xvector.scp
131
+ - spembs
132
+ - kaldi_ark
133
+ valid_data_path_and_name_and_type:
134
+ - - dump/raw/dev_phn/text
135
+ - text
136
+ - text
137
+ - - dump/raw/dev_phn/wav.scp
138
+ - speech
139
+ - sound
140
+ - - dump/xvector/dev_phn/xvector.scp
141
+ - spembs
142
+ - kaldi_ark
143
+ allow_variable_data_keys: false
144
+ max_cache_size: 0.0
145
+ max_cache_fd: 32
146
+ valid_max_cache_size: null
147
+ optim: adam
148
+ optim_conf:
149
+ lr: 0.001
150
+ eps: 1.0e-06
151
+ weight_decay: 0.0
152
+ scheduler: null
153
+ scheduler_conf: {}
154
+ token_list:
155
+ - <blank>
156
+ - <unk>
157
+ - r
158
+ - a
159
+ - t
160
+ - I
161
+ - n
162
+ - s
163
+ - D
164
+ - Y
165
+ - E
166
+ - l
167
+ - v
168
+ - m
169
+ - h
170
+ - k
171
+ - 'a:'
172
+ - j
173
+ - 'E:'
174
+ - T
175
+ - f
176
+ - G
177
+ - p
178
+ - 'i:'
179
+ - 'au:'
180
+ - c
181
+ - 'O:'
182
+ - i
183
+ - r_0
184
+ - 'I:'
185
+ - t_h
186
+ - ei
187
+ - O
188
+ - k_h
189
+ - ou
190
+ - '9'
191
+ - 'u:'
192
+ - ai
193
+ - au
194
+ - 'ou:'
195
+ - u
196
+ - 'ei:'
197
+ - l_0
198
+ - N
199
+ - n_0
200
+ - '9:'
201
+ - p_h
202
+ - 'ai:'
203
+ - c_h
204
+ - 9i
205
+ - C
206
+ - '9i:'
207
+ - x
208
+ - 'Y:'
209
+ - N_0
210
+ - J
211
+ - m_0
212
+ - Yi
213
+ - Oi
214
+ - J_0
215
+ - <sos/eos>
216
+ odim: null
217
+ model_conf: {}
218
+ use_preprocessor: true
219
+ token_type: phn
220
+ bpemodel: null
221
+ non_linguistic_symbols: null
222
+ cleaner: null
223
+ g2p: null
224
+ feats_extract: fbank
225
+ feats_extract_conf:
226
+ n_fft: 1024
227
+ hop_length: 256
228
+ win_length: null
229
+ fs: 22050
230
+ fmin: 80
231
+ fmax: 7600
232
+ n_mels: 80
233
+ normalize: global_mvn
234
+ normalize_conf:
235
+ stats_file: exp/tts_stats_raw_phn_none/train/feats_stats.npz
236
+ tts: tacotron2
237
+ tts_conf:
238
+ embed_dim: 512
239
+ elayers: 1
240
+ eunits: 512
241
+ econv_layers: 3
242
+ econv_chans: 512
243
+ econv_filts: 5
244
+ atype: location
245
+ adim: 512
246
+ aconv_chans: 32
247
+ aconv_filts: 15
248
+ cumulate_att_w: true
249
+ dlayers: 2
250
+ dunits: 1024
251
+ prenet_layers: 2
252
+ prenet_units: 256
253
+ postnet_layers: 5
254
+ postnet_chans: 512
255
+ postnet_filts: 5
256
+ output_activation: null
257
+ use_batch_norm: true
258
+ use_concate: true
259
+ use_residual: false
260
+ spk_embed_dim: 512
261
+ spk_embed_integration_type: add
262
+ dropout_rate: 0.5
263
+ zoneout_rate: 0.1
264
+ reduction_factor: 1
265
+ use_masking: true
266
+ bce_pos_weight: 10.0
267
+ use_guided_attn_loss: true
268
+ guided_attn_loss_sigma: 0.4
269
+ guided_attn_loss_lambda: 1.0
270
+ pitch_extract: null
271
+ pitch_extract_conf: {}
272
+ pitch_normalize: null
273
+ pitch_normalize_conf: {}
274
+ energy_extract: null
275
+ energy_extract_conf: {}
276
+ energy_normalize: null
277
+ energy_normalize_conf: {}
278
+ required:
279
+ - output_dir
280
+ - token_list
281
+ version: '202204'
282
+ distributed: false
283
+ ```
284
+
285
+ </details>
286
+
287
+
288
+
289
+ ### Citing ESPnet
290
+
291
+ ```bibtex
292
+ @inproceedings{watanabe2018espnet,
293
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
294
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
295
+ year={2018},
296
+ booktitle={Proceedings of Interspeech},
297
+ pages={2207--2211},
298
+ doi={10.21437/Interspeech.2018-1456},
299
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
300
+ }
301
+
302
+
303
+
304
+
305
+ @inproceedings{hayashi2020espnet,
306
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
307
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
308
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
309
+ pages={7654--7658},
310
+ year={2020},
311
+ organization={IEEE}
312
+ }
313
+ ```
314
+
315
+ or arXiv:
316
+
317
+ ```bibtex
318
+ @misc{watanabe2018espnet,
319
+ title={ESPnet: End-to-End Speech Processing Toolkit},
320
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
321
+ year={2018},
322
+ eprint={1804.00015},
323
+ archivePrefix={arXiv},
324
+ primaryClass={cs.CL}
325
+ }
326
+ ```
dump/xvector/dev_phn/spk_xvector.ark ADDED
Binary file (82.5 kB). View file
 
dump/xvector/dev_phn/spk_xvector.scp ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ s124 dump/xvector/dev_phn/spk_xvector.ark:5
2
+ s146 dump/xvector/dev_phn/spk_xvector.ark:2068
3
+ s157 dump/xvector/dev_phn/spk_xvector.ark:4131
4
+ s162 dump/xvector/dev_phn/spk_xvector.ark:6194
5
+ s169 dump/xvector/dev_phn/spk_xvector.ark:8257
6
+ s176 dump/xvector/dev_phn/spk_xvector.ark:10320
7
+ s178 dump/xvector/dev_phn/spk_xvector.ark:12383
8
+ s180 dump/xvector/dev_phn/spk_xvector.ark:14446
9
+ s181 dump/xvector/dev_phn/spk_xvector.ark:16509
10
+ s185 dump/xvector/dev_phn/spk_xvector.ark:18572
11
+ s186 dump/xvector/dev_phn/spk_xvector.ark:20635
12
+ s187 dump/xvector/dev_phn/spk_xvector.ark:22698
13
+ s188 dump/xvector/dev_phn/spk_xvector.ark:24761
14
+ s200 dump/xvector/dev_phn/spk_xvector.ark:26824
15
+ s206 dump/xvector/dev_phn/spk_xvector.ark:28887
16
+ s208 dump/xvector/dev_phn/spk_xvector.ark:30950
17
+ s209 dump/xvector/dev_phn/spk_xvector.ark:33013
18
+ s214 dump/xvector/dev_phn/spk_xvector.ark:35076
19
+ s215 dump/xvector/dev_phn/spk_xvector.ark:37139
20
+ s216 dump/xvector/dev_phn/spk_xvector.ark:39202
21
+ s220 dump/xvector/dev_phn/spk_xvector.ark:41265
22
+ s221 dump/xvector/dev_phn/spk_xvector.ark:43328
23
+ s222 dump/xvector/dev_phn/spk_xvector.ark:45391
24
+ s223 dump/xvector/dev_phn/spk_xvector.ark:47454
25
+ s225 dump/xvector/dev_phn/spk_xvector.ark:49517
26
+ s226 dump/xvector/dev_phn/spk_xvector.ark:51580
27
+ s228 dump/xvector/dev_phn/spk_xvector.ark:53643
28
+ s231 dump/xvector/dev_phn/spk_xvector.ark:55706
29
+ s234 dump/xvector/dev_phn/spk_xvector.ark:57769
30
+ s235 dump/xvector/dev_phn/spk_xvector.ark:59832
31
+ s236 dump/xvector/dev_phn/spk_xvector.ark:61895
32
+ s240 dump/xvector/dev_phn/spk_xvector.ark:63958
33
+ s247 dump/xvector/dev_phn/spk_xvector.ark:66021
34
+ s250 dump/xvector/dev_phn/spk_xvector.ark:68084
35
+ s251 dump/xvector/dev_phn/spk_xvector.ark:70147
36
+ s256 dump/xvector/dev_phn/spk_xvector.ark:72210
37
+ s258 dump/xvector/dev_phn/spk_xvector.ark:74273
38
+ s264 dump/xvector/dev_phn/spk_xvector.ark:76336
39
+ s268 dump/xvector/dev_phn/spk_xvector.ark:78399
40
+ s273 dump/xvector/dev_phn/spk_xvector.ark:80462
dump/xvector/eval1_phn/spk_xvector.ark ADDED
Binary file (82.5 kB). View file
 
dump/xvector/eval1_phn/spk_xvector.scp ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ s124 dump/xvector/eval1_phn/spk_xvector.ark:5
2
+ s146 dump/xvector/eval1_phn/spk_xvector.ark:2068
3
+ s157 dump/xvector/eval1_phn/spk_xvector.ark:4131
4
+ s162 dump/xvector/eval1_phn/spk_xvector.ark:6194
5
+ s169 dump/xvector/eval1_phn/spk_xvector.ark:8257
6
+ s176 dump/xvector/eval1_phn/spk_xvector.ark:10320
7
+ s178 dump/xvector/eval1_phn/spk_xvector.ark:12383
8
+ s180 dump/xvector/eval1_phn/spk_xvector.ark:14446
9
+ s181 dump/xvector/eval1_phn/spk_xvector.ark:16509
10
+ s185 dump/xvector/eval1_phn/spk_xvector.ark:18572
11
+ s186 dump/xvector/eval1_phn/spk_xvector.ark:20635
12
+ s187 dump/xvector/eval1_phn/spk_xvector.ark:22698
13
+ s188 dump/xvector/eval1_phn/spk_xvector.ark:24761
14
+ s200 dump/xvector/eval1_phn/spk_xvector.ark:26824
15
+ s206 dump/xvector/eval1_phn/spk_xvector.ark:28887
16
+ s208 dump/xvector/eval1_phn/spk_xvector.ark:30950
17
+ s209 dump/xvector/eval1_phn/spk_xvector.ark:33013
18
+ s214 dump/xvector/eval1_phn/spk_xvector.ark:35076
19
+ s215 dump/xvector/eval1_phn/spk_xvector.ark:37139
20
+ s216 dump/xvector/eval1_phn/spk_xvector.ark:39202
21
+ s220 dump/xvector/eval1_phn/spk_xvector.ark:41265
22
+ s221 dump/xvector/eval1_phn/spk_xvector.ark:43328
23
+ s222 dump/xvector/eval1_phn/spk_xvector.ark:45391
24
+ s223 dump/xvector/eval1_phn/spk_xvector.ark:47454
25
+ s225 dump/xvector/eval1_phn/spk_xvector.ark:49517
26
+ s226 dump/xvector/eval1_phn/spk_xvector.ark:51580
27
+ s228 dump/xvector/eval1_phn/spk_xvector.ark:53643
28
+ s231 dump/xvector/eval1_phn/spk_xvector.ark:55706
29
+ s234 dump/xvector/eval1_phn/spk_xvector.ark:57769
30
+ s235 dump/xvector/eval1_phn/spk_xvector.ark:59832
31
+ s236 dump/xvector/eval1_phn/spk_xvector.ark:61895
32
+ s240 dump/xvector/eval1_phn/spk_xvector.ark:63958
33
+ s247 dump/xvector/eval1_phn/spk_xvector.ark:66021
34
+ s250 dump/xvector/eval1_phn/spk_xvector.ark:68084
35
+ s251 dump/xvector/eval1_phn/spk_xvector.ark:70147
36
+ s256 dump/xvector/eval1_phn/spk_xvector.ark:72210
37
+ s258 dump/xvector/eval1_phn/spk_xvector.ark:74273
38
+ s264 dump/xvector/eval1_phn/spk_xvector.ark:76336
39
+ s268 dump/xvector/eval1_phn/spk_xvector.ark:78399
40
+ s273 dump/xvector/eval1_phn/spk_xvector.ark:80462
dump/xvector/train_phn/spk_xvector.ark ADDED
Binary file (74.3 kB). View file
 
dump/xvector/train_phn/spk_xvector.scp ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ s169 dump/xvector/train_phn/spk_xvector.ark:5
2
+ s176 dump/xvector/train_phn/spk_xvector.ark:2068
3
+ s178 dump/xvector/train_phn/spk_xvector.ark:4131
4
+ s180 dump/xvector/train_phn/spk_xvector.ark:6194
5
+ s181 dump/xvector/train_phn/spk_xvector.ark:8257
6
+ s185 dump/xvector/train_phn/spk_xvector.ark:10320
7
+ s186 dump/xvector/train_phn/spk_xvector.ark:12383
8
+ s187 dump/xvector/train_phn/spk_xvector.ark:14446
9
+ s188 dump/xvector/train_phn/spk_xvector.ark:16509
10
+ s200 dump/xvector/train_phn/spk_xvector.ark:18572
11
+ s206 dump/xvector/train_phn/spk_xvector.ark:20635
12
+ s208 dump/xvector/train_phn/spk_xvector.ark:22698
13
+ s209 dump/xvector/train_phn/spk_xvector.ark:24761
14
+ s214 dump/xvector/train_phn/spk_xvector.ark:26824
15
+ s215 dump/xvector/train_phn/spk_xvector.ark:28887
16
+ s216 dump/xvector/train_phn/spk_xvector.ark:30950
17
+ s220 dump/xvector/train_phn/spk_xvector.ark:33013
18
+ s221 dump/xvector/train_phn/spk_xvector.ark:35076
19
+ s222 dump/xvector/train_phn/spk_xvector.ark:37139
20
+ s223 dump/xvector/train_phn/spk_xvector.ark:39202
21
+ s225 dump/xvector/train_phn/spk_xvector.ark:41265
22
+ s226 dump/xvector/train_phn/spk_xvector.ark:43328
23
+ s228 dump/xvector/train_phn/spk_xvector.ark:45391
24
+ s231 dump/xvector/train_phn/spk_xvector.ark:47454
25
+ s234 dump/xvector/train_phn/spk_xvector.ark:49517
26
+ s235 dump/xvector/train_phn/spk_xvector.ark:51580
27
+ s236 dump/xvector/train_phn/spk_xvector.ark:53643
28
+ s240 dump/xvector/train_phn/spk_xvector.ark:55706
29
+ s247 dump/xvector/train_phn/spk_xvector.ark:57769
30
+ s250 dump/xvector/train_phn/spk_xvector.ark:59832
31
+ s251 dump/xvector/train_phn/spk_xvector.ark:61895
32
+ s256 dump/xvector/train_phn/spk_xvector.ark:63958
33
+ s258 dump/xvector/train_phn/spk_xvector.ark:66021
34
+ s264 dump/xvector/train_phn/spk_xvector.ark:68084
35
+ s268 dump/xvector/train_phn/spk_xvector.ark:70147
36
+ s273 dump/xvector/train_phn/spk_xvector.ark:72210
exp/tts_stats_raw_phn_none/train/feats_stats.npz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:104ea45b8b50ca02a2009fa374993b65a363eee37034b7b5d20f1dc3c11cc402
3
+ size 1402
exp/tts_train_xvector_tacotron2_raw_phn_none/config.yaml ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: ./conf/tuning/train_xvector_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_xvector_tacotron2_raw_phn_none
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 500
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 1
49
+ no_forward_run: false
50
+ resume: true
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ pretrain_path: null
64
+ init_param: []
65
+ ignore_init_mismatch: false
66
+ freeze_param: []
67
+ num_iters_per_epoch: 500
68
+ batch_size: 20
69
+ valid_batch_size: null
70
+ batch_bins: 3750000
71
+ valid_batch_bins: null
72
+ train_shape_file:
73
+ - exp/tts_stats_raw_phn_none/train/text_shape.phn
74
+ - exp/tts_stats_raw_phn_none/train/speech_shape
75
+ valid_shape_file:
76
+ - exp/tts_stats_raw_phn_none/valid/text_shape.phn
77
+ - exp/tts_stats_raw_phn_none/valid/speech_shape
78
+ batch_type: numel
79
+ valid_batch_type: null
80
+ fold_length:
81
+ - 150
82
+ - 204800
83
+ sort_in_batch: descending
84
+ sort_batch: descending
85
+ multiple_iterator: false
86
+ chunk_length: 500
87
+ chunk_shift_ratio: 0.5
88
+ num_cache_chunks: 1024
89
+ train_data_path_and_name_and_type:
90
+ - - dump/raw/train_phn/text
91
+ - text
92
+ - text
93
+ - - dump/raw/train_phn/wav.scp
94
+ - speech
95
+ - sound
96
+ - - dump/xvector/train_phn/xvector.scp
97
+ - spembs
98
+ - kaldi_ark
99
+ valid_data_path_and_name_and_type:
100
+ - - dump/raw/dev_phn/text
101
+ - text
102
+ - text
103
+ - - dump/raw/dev_phn/wav.scp
104
+ - speech
105
+ - sound
106
+ - - dump/xvector/dev_phn/xvector.scp
107
+ - spembs
108
+ - kaldi_ark
109
+ allow_variable_data_keys: false
110
+ max_cache_size: 0.0
111
+ max_cache_fd: 32
112
+ valid_max_cache_size: null
113
+ optim: adam
114
+ optim_conf:
115
+ lr: 0.001
116
+ eps: 1.0e-06
117
+ weight_decay: 0.0
118
+ scheduler: null
119
+ scheduler_conf: {}
120
+ token_list:
121
+ - <blank>
122
+ - <unk>
123
+ - r
124
+ - a
125
+ - t
126
+ - I
127
+ - n
128
+ - s
129
+ - D
130
+ - Y
131
+ - E
132
+ - l
133
+ - v
134
+ - m
135
+ - h
136
+ - k
137
+ - 'a:'
138
+ - j
139
+ - 'E:'
140
+ - T
141
+ - f
142
+ - G
143
+ - p
144
+ - 'i:'
145
+ - 'au:'
146
+ - c
147
+ - 'O:'
148
+ - i
149
+ - r_0
150
+ - 'I:'
151
+ - t_h
152
+ - ei
153
+ - O
154
+ - k_h
155
+ - ou
156
+ - '9'
157
+ - 'u:'
158
+ - ai
159
+ - au
160
+ - 'ou:'
161
+ - u
162
+ - 'ei:'
163
+ - l_0
164
+ - N
165
+ - n_0
166
+ - '9:'
167
+ - p_h
168
+ - 'ai:'
169
+ - c_h
170
+ - 9i
171
+ - C
172
+ - '9i:'
173
+ - x
174
+ - 'Y:'
175
+ - N_0
176
+ - J
177
+ - m_0
178
+ - Yi
179
+ - Oi
180
+ - J_0
181
+ - <sos/eos>
182
+ odim: null
183
+ model_conf: {}
184
+ use_preprocessor: true
185
+ token_type: phn
186
+ bpemodel: null
187
+ non_linguistic_symbols: null
188
+ cleaner: null
189
+ g2p: null
190
+ feats_extract: fbank
191
+ feats_extract_conf:
192
+ n_fft: 1024
193
+ hop_length: 256
194
+ win_length: null
195
+ fs: 22050
196
+ fmin: 80
197
+ fmax: 7600
198
+ n_mels: 80
199
+ normalize: global_mvn
200
+ normalize_conf:
201
+ stats_file: exp/tts_stats_raw_phn_none/train/feats_stats.npz
202
+ tts: tacotron2
203
+ tts_conf:
204
+ embed_dim: 512
205
+ elayers: 1
206
+ eunits: 512
207
+ econv_layers: 3
208
+ econv_chans: 512
209
+ econv_filts: 5
210
+ atype: location
211
+ adim: 512
212
+ aconv_chans: 32
213
+ aconv_filts: 15
214
+ cumulate_att_w: true
215
+ dlayers: 2
216
+ dunits: 1024
217
+ prenet_layers: 2
218
+ prenet_units: 256
219
+ postnet_layers: 5
220
+ postnet_chans: 512
221
+ postnet_filts: 5
222
+ output_activation: null
223
+ use_batch_norm: true
224
+ use_concate: true
225
+ use_residual: false
226
+ spk_embed_dim: 512
227
+ spk_embed_integration_type: add
228
+ dropout_rate: 0.5
229
+ zoneout_rate: 0.1
230
+ reduction_factor: 1
231
+ use_masking: true
232
+ bce_pos_weight: 10.0
233
+ use_guided_attn_loss: true
234
+ guided_attn_loss_sigma: 0.4
235
+ guided_attn_loss_lambda: 1.0
236
+ pitch_extract: null
237
+ pitch_extract_conf: {}
238
+ pitch_normalize: null
239
+ pitch_normalize_conf: {}
240
+ energy_extract: null
241
+ energy_extract_conf: {}
242
+ energy_normalize: null
243
+ energy_normalize_conf: {}
244
+ required:
245
+ - output_dir
246
+ - token_list
247
+ version: '202204'
248
+ distributed: false
exp/tts_train_xvector_tacotron2_raw_phn_none/images/attn_loss.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/images/backward_time.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/images/bce_loss.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/images/forward_time.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/images/iter_time.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/images/l1_loss.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/images/loss.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/images/mse_loss.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/images/optim0_lr0.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/images/optim_step_time.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/images/train_time.png ADDED
exp/tts_train_xvector_tacotron2_raw_phn_none/valid.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9af8b7dd037c7e25c92221967315a4417a43c288ec4a8ad05ce3f90922c383f1
3
+ size 107874278
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: '202204'
2
+ files:
3
+ model_file: exp/tts_train_xvector_tacotron2_raw_phn_none/valid.loss.ave_5best.pth
4
+ python: "3.8.13 (default, Mar 28 2022, 11:38:47) \n[GCC 7.5.0]"
5
+ timestamp: 1658499401.402948
6
+ torch: 1.10.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_xvector_tacotron2_raw_phn_none/config.yaml