Siddhant committed on
Commit
ebfe1f0
1 Parent(s): f8de2e7

import from zenodo

Browse files
Files changed (20) hide show
  1. README.md +50 -0
  2. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/config.yaml +297 -0
  3. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/backward_time.png +0 -0
  4. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/decoder_alpha.png +0 -0
  5. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/duration_loss.png +0 -0
  6. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/encoder_alpha.png +0 -0
  7. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/energy_loss.png +0 -0
  8. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/forward_time.png +0 -0
  9. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/iter_time.png +0 -0
  10. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/l1_loss.png +0 -0
  11. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/loss.png +0 -0
  12. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/lr_0.png +0 -0
  13. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/optim_step_time.png +0 -0
  14. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/pitch_loss.png +0 -0
  15. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/train_time.png +0 -0
  16. exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/train.loss.ave_5best.pth +3 -0
  17. exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz +0 -0
  18. exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz +0 -0
  19. exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz +0 -0
  20. meta.yaml +8 -0
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ja
7
+ datasets:
8
+ - jsut
9
+ license: cc-by-4.0
10
+ ---
11
+ ## Example ESPnet2 TTS model
12
+ ### `kan-bayashi/jsut_fastspeech2_accent`
13
+ ♻️ Imported from https://zenodo.org/record/4381100/
14
+
15
+ This model was trained by kan-bayashi using jsut/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
16
+ ### Demo: How to use in ESPnet2
17
+ ```python
18
+ # coming soon
19
+ ```
20
+ ### Citing ESPnet
21
+ ```BibTeX
22
+ @inproceedings{watanabe2018espnet,
23
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
24
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
25
+ year={2018},
26
+ booktitle={Proceedings of Interspeech},
27
+ pages={2207--2211},
28
+ doi={10.21437/Interspeech.2018-1456},
29
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
30
+ }
31
+ @inproceedings{hayashi2020espnet,
32
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
33
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
34
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
35
+ pages={7654--7658},
36
+ year={2020},
37
+ organization={IEEE}
38
+ }
39
+ ```
40
+ or arXiv:
41
+ ```bibtex
42
+ @misc{watanabe2018espnet,
43
+ title={ESPnet: End-to-End Speech Processing Toolkit},
44
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
45
+ year={2018},
46
+ eprint={1804.00015},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/config.yaml ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_fastspeech2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 55835
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ cudnn_enabled: true
21
+ cudnn_benchmark: false
22
+ cudnn_deterministic: true
23
+ collect_stats: false
24
+ write_collected_feats: false
25
+ max_epoch: 200
26
+ patience: null
27
+ val_scheduler_criterion:
28
+ - valid
29
+ - loss
30
+ early_stopping_criterion:
31
+ - valid
32
+ - loss
33
+ - min
34
+ best_model_criterion:
35
+ - - valid
36
+ - loss
37
+ - min
38
+ - - train
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ grad_clip: 1.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 1
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ unused_parameters: false
52
+ use_tensorboard: true
53
+ use_wandb: false
54
+ wandb_project: null
55
+ wandb_id: null
56
+ pretrain_path: null
57
+ init_param: []
58
+ freeze_param: []
59
+ num_iters_per_epoch: 500
60
+ batch_size: 20
61
+ valid_batch_size: null
62
+ batch_bins: 18000000
63
+ valid_batch_bins: null
64
+ train_shape_file:
65
+ - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/text_shape.phn
66
+ - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/speech_shape
67
+ valid_shape_file:
68
+ - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/text_shape.phn
69
+ - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/speech_shape
70
+ batch_type: numel
71
+ valid_batch_type: null
72
+ fold_length:
73
+ - 150
74
+ - 240000
75
+ sort_in_batch: descending
76
+ sort_batch: descending
77
+ multiple_iterator: false
78
+ chunk_length: 500
79
+ chunk_shift_ratio: 0.5
80
+ num_cache_chunks: 1024
81
+ train_data_path_and_name_and_type:
82
+ - - dump/raw/tr_no_dev/text
83
+ - text
84
+ - text
85
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/tr_no_dev/durations
86
+ - durations
87
+ - text_int
88
+ - - dump/raw/tr_no_dev/wav.scp
89
+ - speech
90
+ - sound
91
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/collect_feats/pitch.scp
92
+ - pitch
93
+ - npy
94
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/collect_feats/energy.scp
95
+ - energy
96
+ - npy
97
+ valid_data_path_and_name_and_type:
98
+ - - dump/raw/dev/text
99
+ - text
100
+ - text
101
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/dev/durations
102
+ - durations
103
+ - text_int
104
+ - - dump/raw/dev/wav.scp
105
+ - speech
106
+ - sound
107
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/collect_feats/pitch.scp
108
+ - pitch
109
+ - npy
110
+ - - exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/valid/collect_feats/energy.scp
111
+ - energy
112
+ - npy
113
+ allow_variable_data_keys: false
114
+ max_cache_size: 0.0
115
+ max_cache_fd: 32
116
+ valid_max_cache_size: null
117
+ optim: adam
118
+ optim_conf:
119
+ lr: 1.0
120
+ scheduler: noamlr
121
+ scheduler_conf:
122
+ model_size: 384
123
+ warmup_steps: 4000
124
+ token_list:
125
+ - <blank>
126
+ - <unk>
127
+ - '1'
128
+ - '2'
129
+ - '0'
130
+ - '3'
131
+ - '4'
132
+ - '-1'
133
+ - '5'
134
+ - a
135
+ - o
136
+ - '-2'
137
+ - i
138
+ - '-3'
139
+ - u
140
+ - e
141
+ - k
142
+ - n
143
+ - t
144
+ - '6'
145
+ - r
146
+ - '-4'
147
+ - s
148
+ - N
149
+ - m
150
+ - '7'
151
+ - sh
152
+ - d
153
+ - g
154
+ - w
155
+ - '8'
156
+ - U
157
+ - '-5'
158
+ - I
159
+ - cl
160
+ - h
161
+ - y
162
+ - b
163
+ - '9'
164
+ - j
165
+ - ts
166
+ - ch
167
+ - '-6'
168
+ - z
169
+ - p
170
+ - '-7'
171
+ - f
172
+ - ky
173
+ - ry
174
+ - '-8'
175
+ - gy
176
+ - '-9'
177
+ - hy
178
+ - ny
179
+ - '-10'
180
+ - by
181
+ - my
182
+ - '-11'
183
+ - '-12'
184
+ - '-13'
185
+ - py
186
+ - '-14'
187
+ - '-15'
188
+ - v
189
+ - '10'
190
+ - '-16'
191
+ - '-17'
192
+ - '11'
193
+ - '-21'
194
+ - '-20'
195
+ - '12'
196
+ - '-19'
197
+ - '13'
198
+ - '-18'
199
+ - '14'
200
+ - dy
201
+ - '15'
202
+ - ty
203
+ - '-22'
204
+ - '16'
205
+ - '18'
206
+ - '19'
207
+ - '17'
208
+ - <sos/eos>
209
+ odim: null
210
+ model_conf: {}
211
+ use_preprocessor: true
212
+ token_type: phn
213
+ bpemodel: null
214
+ non_linguistic_symbols: null
215
+ cleaner: jaconv
216
+ g2p: pyopenjtalk_accent
217
+ feats_extract: fbank
218
+ feats_extract_conf:
219
+ fs: 24000
220
+ fmin: 80
221
+ fmax: 7600
222
+ n_mels: 80
223
+ hop_length: 300
224
+ n_fft: 2048
225
+ win_length: 1200
226
+ normalize: global_mvn
227
+ normalize_conf:
228
+ stats_file: exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz
229
+ tts: fastspeech2
230
+ tts_conf:
231
+ adim: 384
232
+ aheads: 2
233
+ elayers: 4
234
+ eunits: 1536
235
+ dlayers: 4
236
+ dunits: 1536
237
+ positionwise_layer_type: conv1d
238
+ positionwise_conv_kernel_size: 3
239
+ duration_predictor_layers: 2
240
+ duration_predictor_chans: 256
241
+ duration_predictor_kernel_size: 3
242
+ postnet_layers: 5
243
+ postnet_filts: 5
244
+ postnet_chans: 256
245
+ use_masking: true
246
+ use_scaled_pos_enc: true
247
+ encoder_normalize_before: true
248
+ decoder_normalize_before: true
249
+ reduction_factor: 1
250
+ init_type: xavier_uniform
251
+ init_enc_alpha: 1.0
252
+ init_dec_alpha: 1.0
253
+ transformer_enc_dropout_rate: 0.2
254
+ transformer_enc_positional_dropout_rate: 0.2
255
+ transformer_enc_attn_dropout_rate: 0.2
256
+ transformer_dec_dropout_rate: 0.2
257
+ transformer_dec_positional_dropout_rate: 0.2
258
+ transformer_dec_attn_dropout_rate: 0.2
259
+ pitch_predictor_layers: 5
260
+ pitch_predictor_chans: 256
261
+ pitch_predictor_kernel_size: 5
262
+ pitch_predictor_dropout: 0.5
263
+ pitch_embed_kernel_size: 1
264
+ pitch_embed_dropout: 0.0
265
+ stop_gradient_from_pitch_predictor: true
266
+ energy_predictor_layers: 2
267
+ energy_predictor_chans: 256
268
+ energy_predictor_kernel_size: 3
269
+ energy_predictor_dropout: 0.5
270
+ energy_embed_kernel_size: 1
271
+ energy_embed_dropout: 0.0
272
+ stop_gradient_from_energy_predictor: false
273
+ pitch_extract: dio
274
+ pitch_extract_conf:
275
+ fs: 24000
276
+ n_fft: 2048
277
+ hop_length: 300
278
+ f0max: 400
279
+ f0min: 80
280
+ reduction_factor: 1
281
+ pitch_normalize: global_mvn
282
+ pitch_normalize_conf:
283
+ stats_file: exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz
284
+ energy_extract: energy
285
+ energy_extract_conf:
286
+ fs: 24000
287
+ n_fft: 2048
288
+ hop_length: 300
289
+ win_length: 1200
290
+ reduction_factor: 1
291
+ energy_normalize: global_mvn
292
+ energy_normalize_conf:
293
+ stats_file: exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz
294
+ required:
295
+ - output_dir
296
+ - token_list
297
+ distributed: true
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/backward_time.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/decoder_alpha.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/duration_loss.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/encoder_alpha.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/energy_loss.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/forward_time.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/iter_time.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/l1_loss.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/loss.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/lr_0.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/optim_step_time.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/pitch_loss.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/images/train_time.png ADDED
exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1c4252466117f6b9765cf442e05a9eec316a189a093cae3c97e5be38f4087f0
3
+ size 148862757
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/energy_stats.npz ADDED
Binary file (770 Bytes). View file
 
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_accent/decode_use_teacher_forcingtrue_train.loss.ave/stats/train/pitch_stats.npz ADDED
Binary file (770 Bytes). View file
 
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.8.0
2
+ files:
3
+ model_file: exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/train.loss.ave_5best.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1608519920.243737
6
+ torch: 1.5.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_fastspeech2_tacotron2_teacher_raw_phn_jaconv_pyopenjtalk_accent/config.yaml