GunnarThor committed on
Commit
b42013c
1 Parent(s): e9b96c4

Update model

Browse files
Files changed (21) hide show
  1. README.md +369 -0
  2. exp/g/tts_train_fastspeech2_raw_phn_none/config.yaml +291 -0
  3. exp/g/tts_train_fastspeech2_raw_phn_none/images/backward_time.png +0 -0
  4. exp/g/tts_train_fastspeech2_raw_phn_none/images/decoder_alpha.png +0 -0
  5. exp/g/tts_train_fastspeech2_raw_phn_none/images/duration_loss.png +0 -0
  6. exp/g/tts_train_fastspeech2_raw_phn_none/images/encoder_alpha.png +0 -0
  7. exp/g/tts_train_fastspeech2_raw_phn_none/images/energy_loss.png +0 -0
  8. exp/g/tts_train_fastspeech2_raw_phn_none/images/forward_time.png +0 -0
  9. exp/g/tts_train_fastspeech2_raw_phn_none/images/gpu_max_cached_mem_GB.png +0 -0
  10. exp/g/tts_train_fastspeech2_raw_phn_none/images/iter_time.png +0 -0
  11. exp/g/tts_train_fastspeech2_raw_phn_none/images/l1_loss.png +0 -0
  12. exp/g/tts_train_fastspeech2_raw_phn_none/images/loss.png +0 -0
  13. exp/g/tts_train_fastspeech2_raw_phn_none/images/optim0_lr0.png +0 -0
  14. exp/g/tts_train_fastspeech2_raw_phn_none/images/optim_step_time.png +0 -0
  15. exp/g/tts_train_fastspeech2_raw_phn_none/images/pitch_loss.png +0 -0
  16. exp/g/tts_train_fastspeech2_raw_phn_none/images/train_time.png +0 -0
  17. exp/g/tts_train_fastspeech2_raw_phn_none/train.loss.ave_5best.pth +3 -0
  18. exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz +0 -0
  19. exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz +0 -0
  20. exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz +0 -0
  21. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,369 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: is
7
+ datasets:
8
+ - talromur
9
+ license: cc-by-4.0
10
+ ---
11
+
12
+ ## ESPnet2 TTS model
13
+
14
+ ### `espnet/GunnarThor_talromur_g_fastspeech2`
15
+
16
+ This model was trained by Gunnar Thor using the talromur recipe in [espnet](https://github.com/espnet/espnet/).
17
+
18
+ ### Demo: How to use in ESPnet2
19
+
20
+ ```bash
21
+ cd espnet
22
+ git checkout 49a284e69308d81c142b89795de255b4ce290c54
23
+ pip install -e .
24
+ cd egs2/talromur/tts1
25
+ ./run.sh --skip_data_prep false --skip_train true --download_model espnet/GunnarThor_talromur_g_fastspeech2
26
+ ```
27
+
28
+
29
+
30
+ ## TTS config
31
+
32
+ <details><summary>expand</summary>
33
+
34
+ ```
35
+ config: conf/tuning/train_fastspeech2.yaml
36
+ print_config: false
37
+ log_level: INFO
38
+ dry_run: false
39
+ iterator_type: sequence
40
+ output_dir: exp/g/tts_train_fastspeech2_raw_phn_none
41
+ ngpu: 1
42
+ seed: 0
43
+ num_workers: 1
44
+ num_att_plot: 3
45
+ dist_backend: nccl
46
+ dist_init_method: env://
47
+ dist_world_size: null
48
+ dist_rank: null
49
+ local_rank: 0
50
+ dist_master_addr: null
51
+ dist_master_port: null
52
+ dist_launcher: null
53
+ multiprocessing_distributed: false
54
+ unused_parameters: false
55
+ sharded_ddp: false
56
+ cudnn_enabled: true
57
+ cudnn_benchmark: false
58
+ cudnn_deterministic: true
59
+ collect_stats: false
60
+ write_collected_feats: false
61
+ max_epoch: 100
62
+ patience: null
63
+ val_scheduler_criterion:
64
+ - valid
65
+ - loss
66
+ early_stopping_criterion:
67
+ - valid
68
+ - loss
69
+ - min
70
+ best_model_criterion:
71
+ - - valid
72
+ - loss
73
+ - min
74
+ - - train
75
+ - loss
76
+ - min
77
+ keep_nbest_models: 5
78
+ nbest_averaging_interval: 0
79
+ grad_clip: 1.0
80
+ grad_clip_type: 2.0
81
+ grad_noise: false
82
+ accum_grad: 8
83
+ no_forward_run: false
84
+ resume: true
85
+ train_dtype: float32
86
+ use_amp: false
87
+ log_interval: null
88
+ use_matplotlib: true
89
+ use_tensorboard: true
90
+ use_wandb: false
91
+ wandb_project: null
92
+ wandb_id: null
93
+ wandb_entity: null
94
+ wandb_name: null
95
+ wandb_model_log_interval: -1
96
+ detect_anomaly: false
97
+ pretrain_path: null
98
+ init_param: []
99
+ ignore_init_mismatch: false
100
+ freeze_param: []
101
+ num_iters_per_epoch: 800
102
+ batch_size: 20
103
+ valid_batch_size: null
104
+ batch_bins: 2500000
105
+ valid_batch_bins: null
106
+ train_shape_file:
107
+ - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/text_shape.phn
108
+ - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/speech_shape
109
+ valid_shape_file:
110
+ - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/text_shape.phn
111
+ - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/speech_shape
112
+ batch_type: numel
113
+ valid_batch_type: null
114
+ fold_length:
115
+ - 150
116
+ - 204800
117
+ sort_in_batch: descending
118
+ sort_batch: descending
119
+ multiple_iterator: false
120
+ chunk_length: 500
121
+ chunk_shift_ratio: 0.5
122
+ num_cache_chunks: 1024
123
+ train_data_path_and_name_and_type:
124
+ - - dump/raw/train_g_phn/text
125
+ - text
126
+ - text
127
+ - - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/train_g_phn/durations
128
+ - durations
129
+ - text_int
130
+ - - dump/raw/train_g_phn/wav.scp
131
+ - speech
132
+ - sound
133
+ valid_data_path_and_name_and_type:
134
+ - - dump/raw/dev_g_phn/text
135
+ - text
136
+ - text
137
+ - - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/dev_g_phn/durations
138
+ - durations
139
+ - text_int
140
+ - - dump/raw/dev_g_phn/wav.scp
141
+ - speech
142
+ - sound
143
+ allow_variable_data_keys: false
144
+ max_cache_size: 0.0
145
+ max_cache_fd: 32
146
+ valid_max_cache_size: null
147
+ optim: adam
148
+ optim_conf:
149
+ lr: 1.0
150
+ scheduler: noamlr
151
+ scheduler_conf:
152
+ model_size: 384
153
+ warmup_steps: 4000
154
+ token_list:
155
+ - <blank>
156
+ - <unk>
157
+ - ','
158
+ - .
159
+ - r
160
+ - t
161
+ - n
162
+ - a0
163
+ - s
164
+ - I0
165
+ - D
166
+ - l
167
+ - Y0
168
+ - m
169
+ - v
170
+ - h
171
+ - E1
172
+ - k
173
+ - a:1
174
+ - E:1
175
+ - f
176
+ - G
177
+ - j
178
+ - T
179
+ - a1
180
+ - p
181
+ - c
182
+ - au:1
183
+ - i:1
184
+ - O:1
185
+ - I:1
186
+ - E0
187
+ - I1
188
+ - r_0
189
+ - t_h
190
+ - k_h
191
+ - Y1
192
+ - ei1
193
+ - i0
194
+ - ou:1
195
+ - ei:1
196
+ - u:1
197
+ - O1
198
+ - N
199
+ - l_0
200
+ - '91'
201
+ - ai0
202
+ - au1
203
+ - ou0
204
+ - n_0
205
+ - ei0
206
+ - O0
207
+ - ou1
208
+ - ai:1
209
+ - '9:1'
210
+ - ai1
211
+ - i1
212
+ - '90'
213
+ - au0
214
+ - c_h
215
+ - x
216
+ - 9i:1
217
+ - C
218
+ - p_h
219
+ - u0
220
+ - Y:1
221
+ - J
222
+ - 9i1
223
+ - u1
224
+ - 9i0
225
+ - N_0
226
+ - m_0
227
+ - J_0
228
+ - Oi1
229
+ - Yi0
230
+ - Yi1
231
+ - Oi0
232
+ - au:0
233
+ - '9:0'
234
+ - E:0
235
+ - <sos/eos>
236
+ odim: null
237
+ model_conf: {}
238
+ use_preprocessor: true
239
+ token_type: phn
240
+ bpemodel: null
241
+ non_linguistic_symbols: null
242
+ cleaner: null
243
+ g2p: null
244
+ feats_extract: fbank
245
+ feats_extract_conf:
246
+ n_fft: 1024
247
+ hop_length: 256
248
+ win_length: null
249
+ fs: 22050
250
+ fmin: 80
251
+ fmax: 7600
252
+ n_mels: 80
253
+ normalize: global_mvn
254
+ normalize_conf:
255
+ stats_file: exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz
256
+ tts: fastspeech2
257
+ tts_conf:
258
+ adim: 384
259
+ aheads: 2
260
+ elayers: 4
261
+ eunits: 1536
262
+ dlayers: 4
263
+ dunits: 1536
264
+ positionwise_layer_type: conv1d
265
+ positionwise_conv_kernel_size: 3
266
+ duration_predictor_layers: 2
267
+ duration_predictor_chans: 256
268
+ duration_predictor_kernel_size: 3
269
+ postnet_layers: 5
270
+ postnet_filts: 5
271
+ postnet_chans: 256
272
+ use_masking: true
273
+ use_scaled_pos_enc: true
274
+ encoder_normalize_before: true
275
+ decoder_normalize_before: true
276
+ reduction_factor: 1
277
+ init_type: xavier_uniform
278
+ init_enc_alpha: 1.0
279
+ init_dec_alpha: 1.0
280
+ transformer_enc_dropout_rate: 0.2
281
+ transformer_enc_positional_dropout_rate: 0.2
282
+ transformer_enc_attn_dropout_rate: 0.2
283
+ transformer_dec_dropout_rate: 0.2
284
+ transformer_dec_positional_dropout_rate: 0.2
285
+ transformer_dec_attn_dropout_rate: 0.2
286
+ pitch_predictor_layers: 5
287
+ pitch_predictor_chans: 256
288
+ pitch_predictor_kernel_size: 5
289
+ pitch_predictor_dropout: 0.5
290
+ pitch_embed_kernel_size: 1
291
+ pitch_embed_dropout: 0.0
292
+ stop_gradient_from_pitch_predictor: true
293
+ energy_predictor_layers: 2
294
+ energy_predictor_chans: 256
295
+ energy_predictor_kernel_size: 3
296
+ energy_predictor_dropout: 0.5
297
+ energy_embed_kernel_size: 1
298
+ energy_embed_dropout: 0.0
299
+ stop_gradient_from_energy_predictor: false
300
+ pitch_extract: dio
301
+ pitch_extract_conf:
302
+ fs: 22050
303
+ n_fft: 1024
304
+ hop_length: 256
305
+ f0max: 400
306
+ f0min: 80
307
+ reduction_factor: 1
308
+ pitch_normalize: global_mvn
309
+ pitch_normalize_conf:
310
+ stats_file: exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz
311
+ energy_extract: energy
312
+ energy_extract_conf:
313
+ fs: 22050
314
+ n_fft: 1024
315
+ hop_length: 256
316
+ win_length: null
317
+ reduction_factor: 1
318
+ energy_normalize: global_mvn
319
+ energy_normalize_conf:
320
+ stats_file: exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz
321
+ required:
322
+ - output_dir
323
+ - token_list
324
+ version: 0.10.7a1
325
+ distributed: false
326
+ ```
327
+
328
+ </details>
329
+
330
+
331
+
332
+ ### Citing ESPnet
333
+
334
+ ```bibtex
335
+ @inproceedings{watanabe2018espnet,
336
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
337
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
338
+ year={2018},
339
+ booktitle={Proceedings of Interspeech},
340
+ pages={2207--2211},
341
+ doi={10.21437/Interspeech.2018-1456},
342
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
343
+ }
344
+
345
+
346
+
347
+
348
+ @inproceedings{hayashi2020espnet,
349
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
350
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
351
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
352
+ pages={7654--7658},
353
+ year={2020},
354
+ organization={IEEE}
355
+ }
356
+ ```
357
+
358
+ or arXiv:
359
+
360
+ ```bibtex
361
+ @misc{watanabe2018espnet,
362
+ title={ESPnet: End-to-End Speech Processing Toolkit},
363
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Yalta and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
364
+ year={2018},
365
+ eprint={1804.00015},
366
+ archivePrefix={arXiv},
367
+ primaryClass={cs.CL}
368
+ }
369
+ ```
exp/g/tts_train_fastspeech2_raw_phn_none/config.yaml ADDED
@@ -0,0 +1,291 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/g/tts_train_fastspeech2_raw_phn_none
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 100
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ nbest_averaging_interval: 0
45
+ grad_clip: 1.0
46
+ grad_clip_type: 2.0
47
+ grad_noise: false
48
+ accum_grad: 8
49
+ no_forward_run: false
50
+ resume: true
51
+ train_dtype: float32
52
+ use_amp: false
53
+ log_interval: null
54
+ use_matplotlib: true
55
+ use_tensorboard: true
56
+ use_wandb: false
57
+ wandb_project: null
58
+ wandb_id: null
59
+ wandb_entity: null
60
+ wandb_name: null
61
+ wandb_model_log_interval: -1
62
+ detect_anomaly: false
63
+ pretrain_path: null
64
+ init_param: []
65
+ ignore_init_mismatch: false
66
+ freeze_param: []
67
+ num_iters_per_epoch: 800
68
+ batch_size: 20
69
+ valid_batch_size: null
70
+ batch_bins: 2500000
71
+ valid_batch_bins: null
72
+ train_shape_file:
73
+ - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/text_shape.phn
74
+ - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/speech_shape
75
+ valid_shape_file:
76
+ - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/text_shape.phn
77
+ - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/speech_shape
78
+ batch_type: numel
79
+ valid_batch_type: null
80
+ fold_length:
81
+ - 150
82
+ - 204800
83
+ sort_in_batch: descending
84
+ sort_batch: descending
85
+ multiple_iterator: false
86
+ chunk_length: 500
87
+ chunk_shift_ratio: 0.5
88
+ num_cache_chunks: 1024
89
+ train_data_path_and_name_and_type:
90
+ - - dump/raw/train_g_phn/text
91
+ - text
92
+ - text
93
+ - - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/train_g_phn/durations
94
+ - durations
95
+ - text_int
96
+ - - dump/raw/train_g_phn/wav.scp
97
+ - speech
98
+ - sound
99
+ valid_data_path_and_name_and_type:
100
+ - - dump/raw/dev_g_phn/text
101
+ - text
102
+ - text
103
+ - - exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/dev_g_phn/durations
104
+ - durations
105
+ - text_int
106
+ - - dump/raw/dev_g_phn/wav.scp
107
+ - speech
108
+ - sound
109
+ allow_variable_data_keys: false
110
+ max_cache_size: 0.0
111
+ max_cache_fd: 32
112
+ valid_max_cache_size: null
113
+ optim: adam
114
+ optim_conf:
115
+ lr: 1.0
116
+ scheduler: noamlr
117
+ scheduler_conf:
118
+ model_size: 384
119
+ warmup_steps: 4000
120
+ token_list:
121
+ - <blank>
122
+ - <unk>
123
+ - ','
124
+ - .
125
+ - r
126
+ - t
127
+ - n
128
+ - a0
129
+ - s
130
+ - I0
131
+ - D
132
+ - l
133
+ - Y0
134
+ - m
135
+ - v
136
+ - h
137
+ - E1
138
+ - k
139
+ - a:1
140
+ - E:1
141
+ - f
142
+ - G
143
+ - j
144
+ - T
145
+ - a1
146
+ - p
147
+ - c
148
+ - au:1
149
+ - i:1
150
+ - O:1
151
+ - I:1
152
+ - E0
153
+ - I1
154
+ - r_0
155
+ - t_h
156
+ - k_h
157
+ - Y1
158
+ - ei1
159
+ - i0
160
+ - ou:1
161
+ - ei:1
162
+ - u:1
163
+ - O1
164
+ - N
165
+ - l_0
166
+ - '91'
167
+ - ai0
168
+ - au1
169
+ - ou0
170
+ - n_0
171
+ - ei0
172
+ - O0
173
+ - ou1
174
+ - ai:1
175
+ - '9:1'
176
+ - ai1
177
+ - i1
178
+ - '90'
179
+ - au0
180
+ - c_h
181
+ - x
182
+ - 9i:1
183
+ - C
184
+ - p_h
185
+ - u0
186
+ - Y:1
187
+ - J
188
+ - 9i1
189
+ - u1
190
+ - 9i0
191
+ - N_0
192
+ - m_0
193
+ - J_0
194
+ - Oi1
195
+ - Yi0
196
+ - Yi1
197
+ - Oi0
198
+ - au:0
199
+ - '9:0'
200
+ - E:0
201
+ - <sos/eos>
202
+ odim: null
203
+ model_conf: {}
204
+ use_preprocessor: true
205
+ token_type: phn
206
+ bpemodel: null
207
+ non_linguistic_symbols: null
208
+ cleaner: null
209
+ g2p: null
210
+ feats_extract: fbank
211
+ feats_extract_conf:
212
+ n_fft: 1024
213
+ hop_length: 256
214
+ win_length: null
215
+ fs: 22050
216
+ fmin: 80
217
+ fmax: 7600
218
+ n_mels: 80
219
+ normalize: global_mvn
220
+ normalize_conf:
221
+ stats_file: exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz
222
+ tts: fastspeech2
223
+ tts_conf:
224
+ adim: 384
225
+ aheads: 2
226
+ elayers: 4
227
+ eunits: 1536
228
+ dlayers: 4
229
+ dunits: 1536
230
+ positionwise_layer_type: conv1d
231
+ positionwise_conv_kernel_size: 3
232
+ duration_predictor_layers: 2
233
+ duration_predictor_chans: 256
234
+ duration_predictor_kernel_size: 3
235
+ postnet_layers: 5
236
+ postnet_filts: 5
237
+ postnet_chans: 256
238
+ use_masking: true
239
+ use_scaled_pos_enc: true
240
+ encoder_normalize_before: true
241
+ decoder_normalize_before: true
242
+ reduction_factor: 1
243
+ init_type: xavier_uniform
244
+ init_enc_alpha: 1.0
245
+ init_dec_alpha: 1.0
246
+ transformer_enc_dropout_rate: 0.2
247
+ transformer_enc_positional_dropout_rate: 0.2
248
+ transformer_enc_attn_dropout_rate: 0.2
249
+ transformer_dec_dropout_rate: 0.2
250
+ transformer_dec_positional_dropout_rate: 0.2
251
+ transformer_dec_attn_dropout_rate: 0.2
252
+ pitch_predictor_layers: 5
253
+ pitch_predictor_chans: 256
254
+ pitch_predictor_kernel_size: 5
255
+ pitch_predictor_dropout: 0.5
256
+ pitch_embed_kernel_size: 1
257
+ pitch_embed_dropout: 0.0
258
+ stop_gradient_from_pitch_predictor: true
259
+ energy_predictor_layers: 2
260
+ energy_predictor_chans: 256
261
+ energy_predictor_kernel_size: 3
262
+ energy_predictor_dropout: 0.5
263
+ energy_embed_kernel_size: 1
264
+ energy_embed_dropout: 0.0
265
+ stop_gradient_from_energy_predictor: false
266
+ pitch_extract: dio
267
+ pitch_extract_conf:
268
+ fs: 22050
269
+ n_fft: 1024
270
+ hop_length: 256
271
+ f0max: 400
272
+ f0min: 80
273
+ reduction_factor: 1
274
+ pitch_normalize: global_mvn
275
+ pitch_normalize_conf:
276
+ stats_file: exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz
277
+ energy_extract: energy
278
+ energy_extract_conf:
279
+ fs: 22050
280
+ n_fft: 1024
281
+ hop_length: 256
282
+ win_length: null
283
+ reduction_factor: 1
284
+ energy_normalize: global_mvn
285
+ energy_normalize_conf:
286
+ stats_file: exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz
287
+ required:
288
+ - output_dir
289
+ - token_list
290
+ version: 0.10.7a1
291
+ distributed: false
exp/g/tts_train_fastspeech2_raw_phn_none/images/backward_time.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/decoder_alpha.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/duration_loss.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/encoder_alpha.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/energy_loss.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/forward_time.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/gpu_max_cached_mem_GB.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/iter_time.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/l1_loss.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/loss.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/optim0_lr0.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/optim_step_time.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/pitch_loss.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/images/train_time.png ADDED
exp/g/tts_train_fastspeech2_raw_phn_none/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e4be2c0e6f2b16e2f68d9090cb62d54fd9e4d8ab41e0ee8347ae0700e492b76b
3
+ size 148726473
exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz ADDED
Binary file (770 Bytes). View file
 
exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/g/tts_train_tacotron2_raw_phn_none/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz ADDED
Binary file (770 Bytes). View file
 
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.7a1
2
+ files:
3
+ model_file: exp/g/tts_train_fastspeech2_raw_phn_none/train.loss.ave_5best.pth
4
+ python: "3.8.12 (default, Oct 12 2021, 13:49:34) \n[GCC 7.5.0]"
5
+ timestamp: 1650468944.909435
6
+ torch: 1.10.2+cu102
7
+ yaml_files:
8
+ train_config: exp/g/tts_train_fastspeech2_raw_phn_none/config.yaml