Siddhant committed on
Commit
09c7973
1 Parent(s): cef8af3

import from zenodo

Browse files
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: zh
7
+ datasets:
8
+ - csmsc
9
+ license: cc-by-4.0
10
+ ---
11
+ ## Example ESPnet2 TTS model
12
+ ### `kan-bayashi/csmsc_tts_train_transformer_raw_phn_pypinyin_g2p_phone_train.loss.ave`
13
+ ♻️ Imported from https://zenodo.org/record/4034125/
14
+
15
+ This model was trained by kan-bayashi using csmsc/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
16
+ ### Demo: How to use in ESPnet2
17
+ ```python
18
+ # coming soon
19
+ ```
20
+ ### Citing ESPnet
21
+ ```bibtex
22
+ @inproceedings{watanabe2018espnet,
23
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
24
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
25
+ year={2018},
26
+ booktitle={Proceedings of Interspeech},
27
+ pages={2207--2211},
28
+ doi={10.21437/Interspeech.2018-1456},
29
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
30
+ }
31
+ @inproceedings{hayashi2020espnet,
32
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
33
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
34
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
35
+ pages={7654--7658},
36
+ year={2020},
37
+ organization={IEEE}
38
+ }
39
+ ```
40
+ or arXiv:
41
+ ```bibtex
42
+ @misc{watanabe2018espnet,
43
+ title={ESPnet: End-to-End Speech Processing Toolkit},
44
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
45
+ year={2018},
46
+ eprint={1804.00015},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/config.yaml ADDED
@@ -0,0 +1,385 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_transformer.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 35399
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ cudnn_enabled: true
21
+ cudnn_benchmark: false
22
+ cudnn_deterministic: true
23
+ collect_stats: false
24
+ write_collected_feats: false
25
+ max_epoch: 200
26
+ patience: null
27
+ val_scheduler_criterion:
28
+ - valid
29
+ - loss
30
+ early_stopping_criterion:
31
+ - valid
32
+ - loss
33
+ - min
34
+ best_model_criterion:
35
+ - - valid
36
+ - loss
37
+ - min
38
+ - - train
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ grad_clip: 1.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 2
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ pretrain_path: []
52
+ pretrain_key: []
53
+ num_iters_per_epoch: 1000
54
+ batch_size: 20
55
+ valid_batch_size: null
56
+ batch_bins: 9000000
57
+ valid_batch_bins: null
58
+ train_shape_file:
59
+ - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/text_shape.phn
60
+ - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/speech_shape
61
+ valid_shape_file:
62
+ - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/text_shape.phn
63
+ - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/speech_shape
64
+ batch_type: numel
65
+ valid_batch_type: null
66
+ fold_length:
67
+ - 150
68
+ - 240000
69
+ sort_in_batch: descending
70
+ sort_batch: descending
71
+ multiple_iterator: false
72
+ chunk_length: 500
73
+ chunk_shift_ratio: 0.5
74
+ num_cache_chunks: 1024
75
+ train_data_path_and_name_and_type:
76
+ - - dump/raw/tr_no_dev/text
77
+ - text
78
+ - text
79
+ - - dump/raw/tr_no_dev/wav.scp
80
+ - speech
81
+ - sound
82
+ valid_data_path_and_name_and_type:
83
+ - - dump/raw/dev/text
84
+ - text
85
+ - text
86
+ - - dump/raw/dev/wav.scp
87
+ - speech
88
+ - sound
89
+ allow_variable_data_keys: false
90
+ max_cache_size: 0.0
91
+ valid_max_cache_size: null
92
+ optim: adam
93
+ optim_conf:
94
+ lr: 1.0
95
+ scheduler: noamlr
96
+ scheduler_conf:
97
+ model_size: 512
98
+ warmup_steps: 8000
99
+ token_list:
100
+ - <blank>
101
+ - <unk>
102
+ - d
103
+ - sh
104
+ - j
105
+ - l
106
+ - "\u3002"
107
+ - zh
108
+ - "\uFF0C"
109
+ - i4
110
+ - x
111
+ - h
112
+ - b
113
+ - e
114
+ - g
115
+ - t
116
+ - m
117
+ - z
118
+ - q
119
+ - i1
120
+ - i3
121
+ - ch
122
+ - u4
123
+ - n
124
+ - f
125
+ - i2
126
+ - r
127
+ - k
128
+ - s
129
+ - e4
130
+ - ai4
131
+ - a1
132
+ - c
133
+ - p
134
+ - ian4
135
+ - uo3
136
+ - ao3
137
+ - ai2
138
+ - ao4
139
+ - an4
140
+ - ong1
141
+ - u3
142
+ - ing2
143
+ - en2
144
+ - e2
145
+ - u2
146
+ - ui4
147
+ - ian2
148
+ - iou3
149
+ - ang4
150
+ - u1
151
+ - iao4
152
+ - uo4
153
+ - eng2
154
+ - a4
155
+ - in1
156
+ - eng1
157
+ - ou3
158
+ - ang1
159
+ - ian1
160
+ - ou4
161
+ - ing1
162
+ - uo1
163
+ - an1
164
+ - ian3
165
+ - ie3
166
+ - a3
167
+ - ing4
168
+ - an3
169
+ - an2
170
+ - "\xFC4"
171
+ - iao3
172
+ - ei4
173
+ - ong2
174
+ - en1
175
+ - uei4
176
+ - "\xFCan2"
177
+ - ang2
178
+ - ang3
179
+ - iu4
180
+ - iang4
181
+ - ai3
182
+ - ao1
183
+ - ou1
184
+ - eng4
185
+ - iang3
186
+ - en3
187
+ - ai1
188
+ - ong4
189
+ - ie4
190
+ - e3
191
+ - ia1
192
+ - uo2
193
+ - ia4
194
+ - "\xFC3"
195
+ - uan1
196
+ - er2
197
+ - ei3
198
+ - ei2
199
+ - iang1
200
+ - i
201
+ - ing3
202
+ - en4
203
+ - "\xFC2"
204
+ - uan3
205
+ - e1
206
+ - in2
207
+ - iao1
208
+ - in4
209
+ - ie1
210
+ - ong3
211
+ - iang2
212
+ - ie2
213
+ - uan4
214
+ - a2
215
+ - ui3
216
+ - eng3
217
+ - uan2
218
+ - "\xFCe4"
219
+ - uai4
220
+ - ou2
221
+ - "\uFF1F"
222
+ - "\xFCe2"
223
+ - in3
224
+ - uang3
225
+ - uang1
226
+ - iu2
227
+ - en
228
+ - a
229
+ - ao2
230
+ - ua4
231
+ - un1
232
+ - ui1
233
+ - uei2
234
+ - iong4
235
+ - uang2
236
+ - v3
237
+ - ui2
238
+ - iao2
239
+ - uang4
240
+ - "\xFC1"
241
+ - ei1
242
+ - o2
243
+ - er4
244
+ - iou2
245
+ - iou4
246
+ - "\uFF01"
247
+ - ua1
248
+ - "\xFCan4"
249
+ - iu3
250
+ - un4
251
+ - "\xFCan3"
252
+ - uen2
253
+ - "\xFCn4"
254
+ - iu1
255
+ - un3
256
+ - uen4
257
+ - er3
258
+ - "\xFCn1"
259
+ - un2
260
+ - "\xFCn2"
261
+ - o4
262
+ - o1
263
+ - ua2
264
+ - uei1
265
+ - uei3
266
+ - ia3
267
+ - iong3
268
+ - ua3
269
+ - ia
270
+ - "\xFCe1"
271
+ - v4
272
+ - "\xFCan1"
273
+ - iong1
274
+ - ia2
275
+ - uai1
276
+ - iong2
277
+ - iou1
278
+ - uai3
279
+ - "\xFCe3"
280
+ - uen1
281
+ - uen3
282
+ - uai2
283
+ - o3
284
+ - er
285
+ - ve4
286
+ - io1
287
+ - "\xFCn3"
288
+ - u
289
+ - ou
290
+ - o
291
+ - ang
292
+ - ueng1
293
+ - v2
294
+ - uo
295
+ - ao
296
+ - ueng4
297
+ - ua
298
+ - ei
299
+ - uen
300
+ - an
301
+ - '2'
302
+ - ueng3
303
+ - iang
304
+ - "\xFC"
305
+ - ie
306
+ - "\uFF30"
307
+ - "\uFF22"
308
+ - ai
309
+ - <sos/eos>
310
+ odim: null
311
+ model_conf: {}
312
+ use_preprocessor: true
313
+ token_type: phn
314
+ bpemodel: null
315
+ non_linguistic_symbols: null
316
+ cleaner: null
317
+ g2p: pypinyin_g2p_phone
318
+ feats_extract: fbank
319
+ feats_extract_conf:
320
+ fs: 24000
321
+ fmin: 80
322
+ fmax: 7600
323
+ n_mels: 80
324
+ hop_length: 300
325
+ n_fft: 2048
326
+ win_length: 1200
327
+ normalize: global_mvn
328
+ normalize_conf:
329
+ stats_file: exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/feats_stats.npz
330
+ tts: transformer
331
+ tts_conf:
332
+ embed_dim: 0
333
+ eprenet_conv_layers: 0
334
+ eprenet_conv_filts: 0
335
+ eprenet_conv_chans: 0
336
+ dprenet_layers: 2
337
+ dprenet_units: 256
338
+ adim: 512
339
+ aheads: 8
340
+ elayers: 6
341
+ eunits: 1024
342
+ dlayers: 6
343
+ dunits: 1024
344
+ positionwise_layer_type: conv1d
345
+ positionwise_conv_kernel_size: 1
346
+ postnet_layers: 5
347
+ postnet_filts: 5
348
+ postnet_chans: 256
349
+ use_masking: true
350
+ bce_pos_weight: 5.0
351
+ use_scaled_pos_enc: true
352
+ encoder_normalize_before: false
353
+ decoder_normalize_before: false
354
+ reduction_factor: 1
355
+ init_type: xavier_uniform
356
+ init_enc_alpha: 1.0
357
+ init_dec_alpha: 1.0
358
+ eprenet_dropout_rate: 0.0
359
+ dprenet_dropout_rate: 0.5
360
+ postnet_dropout_rate: 0.5
361
+ transformer_enc_dropout_rate: 0.1
362
+ transformer_enc_positional_dropout_rate: 0.1
363
+ transformer_enc_attn_dropout_rate: 0.1
364
+ transformer_dec_dropout_rate: 0.1
365
+ transformer_dec_positional_dropout_rate: 0.1
366
+ transformer_dec_attn_dropout_rate: 0.1
367
+ transformer_enc_dec_attn_dropout_rate: 0.1
368
+ use_guided_attn_loss: true
369
+ num_heads_applied_guided_attn: 2
370
+ num_layers_applied_guided_attn: 2
371
+ modules_applied_guided_attn:
372
+ - encoder-decoder
373
+ guided_attn_loss_lambda: 10.0
374
+ pitch_extract: null
375
+ pitch_extract_conf: {}
376
+ pitch_normalize: null
377
+ pitch_normalize_conf: {}
378
+ energy_extract: null
379
+ energy_extract_conf: {}
380
+ energy_normalize: null
381
+ energy_normalize_conf: {}
382
+ required:
383
+ - output_dir
384
+ - token_list
385
+ distributed: true
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/backward_time.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/bce_loss.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/decoder_alpha.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/enc_dec_attn_loss.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/encoder_alpha.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/forward_time.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/iter_time.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/l1_loss.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/l2_loss.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/loss.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/lr_0.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/optim_step_time.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/images/train_time.png ADDED
exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a827b3bca3d17c1df09f1bc4561265a6df4ad5808200e6a0f86f2b1415ff9f61
3
+ size 132805695
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
1
+ espnet: 0.8.0
2
+ files:
3
+ model_file: exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/train.loss.ave_5best.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1600329881.266315
6
+ torch: 1.5.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_transformer_raw_phn_pypinyin_g2p_phone/config.yaml