Fhrozen committed on
Commit
1ace703
1 Parent(s): 3a0f8f5
Files changed (17) hide show
  1. README.md +90 -0
  2. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/config.yaml +256 -0
  3. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/backward_time.png +0 -0
  4. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/duration_loss.png +0 -0
  5. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/energy_loss.png +0 -0
  6. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/forward_time.png +0 -0
  7. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/iter_time.png +0 -0
  8. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/l1_loss.png +0 -0
  9. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/loss.png +0 -0
  10. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/lr_0.png +0 -0
  11. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/optim_step_time.png +0 -0
  12. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/pitch_loss.png +0 -0
  13. exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/train_time.png +0 -0
  14. exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/energy_stats.npz +0 -0
  15. exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/feats_stats.npz +0 -0
  16. exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/pitch_stats.npz +0 -0
  17. meta.yaml +8 -0
README.md CHANGED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ja
7
+ datasets:
8
+ - jsut
9
+ license: cc-by-4.0
10
+ widget:
11
+ - text: "こんにちは、お元気ですか?"
12
+ ---
13
+
14
+ # ESPnet2 TTS pretrained model
15
+
16
+ ## `kan-bayashi/jsut_tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk_train.loss.ave`
17
+
18
+ ♻️ Imported from <https://zenodo.org/record/4017026#.YN70XJozZH4>
19
+
20
+ This model was trained by kan-bayashi using the jsut/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
21
+
22
+ ### Demo: How to use in ESPnet2
23
+
24
+ ```python
25
+ # coming soon
26
+ ```
27
+
28
+ ### Citing ESPnet
29
+
30
+ ```bibtex
31
+ @inproceedings{watanabe2018espnet,
32
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
33
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
34
+ year={2018},
35
+ booktitle={Proceedings of Interspeech},
36
+ pages={2207--2211},
37
+ doi={10.21437/Interspeech.2018-1456},
38
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
39
+ }
40
+ @inproceedings{hayashi2020espnet,
41
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
42
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
43
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
44
+ pages={7654--7658},
45
+ year={2020},
46
+ organization={IEEE}
47
+ }
48
+ ```
49
+
50
+ or arXiv:
51
+
52
+ ```bibtex
53
+ @misc{watanabe2018espnet,
54
+ title={ESPnet: End-to-End Speech Processing Toolkit},
55
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
56
+ year={2018},
57
+ eprint={1804.00015},
58
+ archivePrefix={arXiv},
59
+ primaryClass={cs.CL}
60
+ }
61
+ ```
62
+
63
+ ### Training config
64
+
65
+ See full config in [`config.yaml`](./exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/config.yaml)
66
+
67
+ ```yaml
68
+ config: conf/tuning/train_conformer_fastspeech2.yaml
69
+ print_config: false
70
+ log_level: INFO
71
+ dry_run: false
72
+ iterator_type: sequence
73
+ output_dir: exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk
74
+ ngpu: 1
75
+ seed: 0
76
+ num_workers: 1
77
+ num_att_plot: 3
78
+ dist_backend: nccl
79
+ dist_init_method: env://
80
+ dist_world_size: null
81
+ dist_rank: null
82
+ local_rank: 0
83
+ dist_master_addr: null
84
+ dist_master_port: null
85
+ dist_launcher: null
86
+ multiprocessing_distributed: false
87
+ cudnn_enabled: true
88
+ cudnn_benchmark: false
89
+ cudnn_deterministic: true
90
+ ```
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/config.yaml ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_conformer_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ cudnn_enabled: true
21
+ cudnn_benchmark: false
22
+ cudnn_deterministic: true
23
+ collect_stats: false
24
+ write_collected_feats: false
25
+ max_epoch: 1000
26
+ patience: null
27
+ val_scheduler_criterion:
28
+ - valid
29
+ - loss
30
+ early_stopping_criterion:
31
+ - valid
32
+ - loss
33
+ - min
34
+ best_model_criterion:
35
+ - - valid
36
+ - loss
37
+ - min
38
+ - - train
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ grad_clip: 1.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 10
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ pretrain_path: []
52
+ pretrain_key: []
53
+ num_iters_per_epoch: null
54
+ batch_size: 20
55
+ valid_batch_size: null
56
+ batch_bins: 2400000
57
+ valid_batch_bins: null
58
+ train_shape_file:
59
+ - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/text_shape.phn
60
+ - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/speech_shape
61
+ valid_shape_file:
62
+ - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/valid/text_shape.phn
63
+ - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/valid/speech_shape
64
+ batch_type: numel
65
+ valid_batch_type: null
66
+ fold_length:
67
+ - 150
68
+ - 240000
69
+ sort_in_batch: descending
70
+ sort_batch: descending
71
+ multiple_iterator: false
72
+ chunk_length: 500
73
+ chunk_shift_ratio: 0.5
74
+ num_cache_chunks: 1024
75
+ train_data_path_and_name_and_type:
76
+ - - dump/raw/tr_no_dev/text
77
+ - text
78
+ - text
79
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/tr_no_dev/durations
80
+ - durations
81
+ - text_int
82
+ - - dump/raw/tr_no_dev/wav.scp
83
+ - speech
84
+ - sound
85
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/collect_feats/pitch.scp
86
+ - pitch
87
+ - npy
88
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/collect_feats/energy.scp
89
+ - energy
90
+ - npy
91
+ valid_data_path_and_name_and_type:
92
+ - - dump/raw/dev/text
93
+ - text
94
+ - text
95
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/dev/durations
96
+ - durations
97
+ - text_int
98
+ - - dump/raw/dev/wav.scp
99
+ - speech
100
+ - sound
101
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/valid/collect_feats/pitch.scp
102
+ - pitch
103
+ - npy
104
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/valid/collect_feats/energy.scp
105
+ - energy
106
+ - npy
107
+ allow_variable_data_keys: false
108
+ max_cache_size: 0.0
109
+ valid_max_cache_size: null
110
+ optim: adam
111
+ optim_conf:
112
+ lr: 1.0
113
+ scheduler: noamlr
114
+ scheduler_conf:
115
+ model_size: 384
116
+ warmup_steps: 4000
117
+ token_list:
118
+ - <blank>
119
+ - <unk>
120
+ - ty
121
+ - dy
122
+ - v
123
+ - py
124
+ - my
125
+ - by
126
+ - ny
127
+ - hy
128
+ - gy
129
+ - ry
130
+ - ky
131
+ - f
132
+ - p
133
+ - z
134
+ - ch
135
+ - ts
136
+ - j
137
+ - b
138
+ - y
139
+ - h
140
+ - cl
141
+ - I
142
+ - U
143
+ - w
144
+ - g
145
+ - d
146
+ - sh
147
+ - pau
148
+ - m
149
+ - N
150
+ - s
151
+ - r
152
+ - t
153
+ - n
154
+ - k
155
+ - e
156
+ - u
157
+ - i
158
+ - o
159
+ - a
160
+ - <sos/eos>
161
+ odim: null
162
+ model_conf: {}
163
+ use_preprocessor: true
164
+ token_type: phn
165
+ bpemodel: null
166
+ non_linguistic_symbols: null
167
+ cleaner: jaconv
168
+ g2p: pyopenjtalk
169
+ feats_extract: fbank
170
+ feats_extract_conf:
171
+ fs: 24000
172
+ fmin: 80
173
+ fmax: 7600
174
+ n_mels: 80
175
+ hop_length: 300
176
+ n_fft: 2048
177
+ win_length: 1200
178
+ normalize: global_mvn
179
+ normalize_conf:
180
+ stats_file: exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/feats_stats.npz
181
+ tts: fastspeech2
182
+ tts_conf:
183
+ adim: 384
184
+ aheads: 2
185
+ elayers: 4
186
+ eunits: 1536
187
+ dlayers: 4
188
+ dunits: 1536
189
+ positionwise_layer_type: conv1d
190
+ positionwise_conv_kernel_size: 3
191
+ duration_predictor_layers: 2
192
+ duration_predictor_chans: 256
193
+ duration_predictor_kernel_size: 3
194
+ postnet_layers: 5
195
+ postnet_filts: 5
196
+ postnet_chans: 256
197
+ use_masking: true
198
+ use_scaled_pos_enc: true
199
+ encoder_normalize_before: false
200
+ decoder_normalize_before: false
201
+ reduction_factor: 1
202
+ encoder_type: conformer
203
+ decoder_type: conformer
204
+ conformer_pos_enc_layer_type: rel_pos
205
+ conformer_self_attn_layer_type: rel_selfattn
206
+ conformer_activation_type: swish
207
+ use_macaron_style_in_conformer: true
208
+ use_cnn_in_conformer: true
209
+ conformer_enc_kernel_size: 7
210
+ conformer_dec_kernel_size: 31
211
+ init_type: xavier_uniform
212
+ init_enc_alpha: 1.0
213
+ init_dec_alpha: 1.0
214
+ transformer_enc_dropout_rate: 0.2
215
+ transformer_enc_positional_dropout_rate: 0.2
216
+ transformer_enc_attn_dropout_rate: 0.2
217
+ transformer_dec_dropout_rate: 0.2
218
+ transformer_dec_positional_dropout_rate: 0.2
219
+ transformer_dec_attn_dropout_rate: 0.2
220
+ pitch_predictor_layers: 2
221
+ pitch_predictor_chans: 256
222
+ pitch_predictor_kernel_size: 3
223
+ pitch_predictor_dropout: 0.5
224
+ pitch_embed_kernel_size: 1
225
+ pitch_embed_dropout: 0.0
226
+ stop_gradient_from_pitch_predictor: false
227
+ energy_predictor_layers: 5
228
+ energy_predictor_chans: 256
229
+ energy_predictor_kernel_size: 5
230
+ energy_predictor_dropout: 0.5
231
+ energy_embed_kernel_size: 1
232
+ energy_embed_dropout: 0.0
233
+ stop_gradient_from_energy_predictor: true
234
+ pitch_extract: dio
235
+ pitch_extract_conf:
236
+ fs: 24000
237
+ n_fft: 2048
238
+ hop_length: 300
239
+ f0max: 400
240
+ f0min: 80
241
+ pitch_normalize: global_mvn
242
+ pitch_normalize_conf:
243
+ stats_file: exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/pitch_stats.npz
244
+ energy_extract: energy
245
+ energy_extract_conf:
246
+ fs: 24000
247
+ n_fft: 2048
248
+ hop_length: 300
249
+ win_length: 1200
250
+ energy_normalize: global_mvn
251
+ energy_normalize_conf:
252
+ stats_file: exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/energy_stats.npz
253
+ required:
254
+ - output_dir
255
+ - token_list
256
+ distributed: false
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/backward_time.png ADDED
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/duration_loss.png ADDED
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/energy_loss.png ADDED
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/forward_time.png ADDED
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/iter_time.png ADDED
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/l1_loss.png ADDED
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/loss.png ADDED
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/lr_0.png ADDED
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/optim_step_time.png ADDED
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/pitch_loss.png ADDED
exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/images/train_time.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/energy_stats.npz ADDED
Binary file (770 Bytes). View file
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk/decode_tacotron2_teacher_forcing_train.loss.best/stats/train/pitch_stats.npz ADDED
Binary file (770 Bytes). View file
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
1
+ espnet: 0.8.0
2
+ files:
3
+ model_file: exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/train.loss.ave_5best.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1599439907.890524
6
+ torch: 1.6.0
7
+ yaml_files:
8
+ train_config: exp/tts_train_conformer_fastspeech2_raw_phn_jaconv_pyopenjtalk/config.yaml