Siddhant committed on
Commit
ba56748
1 Parent(s): 64c9510

import from zenodo

Browse files
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: zh
7
+ datasets:
8
+ - csmsc
9
+ license: cc-by-4.0
10
+ ---
11
+ ## Example ESPnet2 TTS model
12
+ ### `kan-bayashi/csmsc_tacotron2`
13
+ ♻️ Imported from https://zenodo.org/record/3969118/
14
+
15
+ This model was trained by kan-bayashi using the csmsc/tts1 recipe in [ESPnet](https://github.com/espnet/espnet/).
16
+ ### Demo: How to use in ESPnet2
17
+ ```python
18
+ # coming soon
19
+ ```
20
+ ### Citing ESPnet
21
+ ```bibtex
22
+ @inproceedings{watanabe2018espnet,
23
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
24
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
25
+ year={2018},
26
+ booktitle={Proceedings of Interspeech},
27
+ pages={2207--2211},
28
+ doi={10.21437/Interspeech.2018-1456},
29
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
30
+ }
31
+ @inproceedings{hayashi2020espnet,
32
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
33
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
34
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
35
+ pages={7654--7658},
36
+ year={2020},
37
+ organization={IEEE}
38
+ }
39
+ ```
40
+ or arXiv:
41
+ ```bibtex
42
+ @misc{watanabe2018espnet,
43
+ title={ESPnet: End-to-End Speech Processing Toolkit},
44
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
45
+ year={2018},
46
+ eprint={1804.00015},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/199epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10844464e4febdfeaae598b185b99ae6c46e2322f4ad93fcc63f20a13972a611
3
+ size 107274977
exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/config.yaml ADDED
@@ -0,0 +1,361 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/train.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_raw_phn_pypinyin_g2p_phone
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ cudnn_enabled: true
21
+ cudnn_benchmark: false
22
+ cudnn_deterministic: true
23
+ collect_stats: false
24
+ write_collected_feats: false
25
+ max_epoch: 200
26
+ patience: null
27
+ val_scheduler_criterion:
28
+ - valid
29
+ - loss
30
+ early_stopping_criterion:
31
+ - valid
32
+ - loss
33
+ - min
34
+ best_model_criterion:
35
+ - - valid
36
+ - loss
37
+ - min
38
+ - - train
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ grad_clip: 1.0
43
+ grad_noise: false
44
+ accum_grad: 1
45
+ no_forward_run: false
46
+ resume: true
47
+ train_dtype: float32
48
+ log_interval: null
49
+ pretrain_path: []
50
+ pretrain_key: []
51
+ num_iters_per_epoch: null
52
+ batch_size: 20
53
+ valid_batch_size: null
54
+ batch_bins: 3750000
55
+ valid_batch_bins: null
56
+ train_shape_file:
57
+ - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/text_shape.phn
58
+ - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/speech_shape
59
+ valid_shape_file:
60
+ - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/text_shape.phn
61
+ - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/speech_shape
62
+ batch_type: numel
63
+ valid_batch_type: null
64
+ fold_length:
65
+ - 150
66
+ - 240000
67
+ sort_in_batch: descending
68
+ sort_batch: descending
69
+ multiple_iterator: false
70
+ chunk_length: 500
71
+ chunk_shift_ratio: 0.5
72
+ num_cache_chunks: 1024
73
+ train_data_path_and_name_and_type:
74
+ - - dump/raw/tr_no_dev/text
75
+ - text
76
+ - text
77
+ - - dump/raw/tr_no_dev/wav.scp
78
+ - speech
79
+ - sound
80
+ valid_data_path_and_name_and_type:
81
+ - - dump/raw/dev/text
82
+ - text
83
+ - text
84
+ - - dump/raw/dev/wav.scp
85
+ - speech
86
+ - sound
87
+ allow_variable_data_keys: false
88
+ max_cache_size: 0.0
89
+ valid_max_cache_size: null
90
+ optim: adam
91
+ optim_conf:
92
+ lr: 0.001
93
+ eps: 1.0e-06
94
+ weight_decay: 0.0
95
+ scheduler: null
96
+ scheduler_conf: {}
97
+ token_list:
98
+ - <blank>
99
+ - <unk>
100
+ - "\uFF30"
101
+ - "\uFF22"
102
+ - "\xFC"
103
+ - an
104
+ - ueng3
105
+ - '2'
106
+ - uen
107
+ - ei
108
+ - ua
109
+ - ao
110
+ - u
111
+ - ueng4
112
+ - uo
113
+ - ang
114
+ - ou
115
+ - v2
116
+ - ueng1
117
+ - o
118
+ - io1
119
+ - "\xFCn3"
120
+ - er
121
+ - ve4
122
+ - o3
123
+ - uai2
124
+ - uen3
125
+ - uen1
126
+ - uai3
127
+ - "\xFCe3"
128
+ - iou1
129
+ - iong2
130
+ - ia2
131
+ - uai1
132
+ - iong1
133
+ - "\xFCan1"
134
+ - "\xFCe1"
135
+ - v4
136
+ - ua3
137
+ - ia
138
+ - iong3
139
+ - uei3
140
+ - ua2
141
+ - ia3
142
+ - uei1
143
+ - o1
144
+ - o4
145
+ - "\xFCn2"
146
+ - un2
147
+ - er3
148
+ - "\xFCn1"
149
+ - uen4
150
+ - un3
151
+ - iu1
152
+ - "\xFCn4"
153
+ - uen2
154
+ - "\xFCan3"
155
+ - un4
156
+ - "\xFCan4"
157
+ - iu3
158
+ - ua1
159
+ - uei2
160
+ - "\uFF01"
161
+ - iou4
162
+ - iou2
163
+ - er4
164
+ - o2
165
+ - ei1
166
+ - iao2
167
+ - uang4
168
+ - "\xFC1"
169
+ - ui2
170
+ - v3
171
+ - uang2
172
+ - iong4
173
+ - un1
174
+ - ui1
175
+ - ua4
176
+ - ao2
177
+ - en
178
+ - a
179
+ - iu2
180
+ - uang1
181
+ - uang3
182
+ - "\xFCe2"
183
+ - in3
184
+ - "\uFF1F"
185
+ - uai4
186
+ - "\xFCe4"
187
+ - uan2
188
+ - ou2
189
+ - eng3
190
+ - ui3
191
+ - uan4
192
+ - a2
193
+ - ie2
194
+ - ong3
195
+ - iang2
196
+ - ie1
197
+ - in4
198
+ - iao1
199
+ - e1
200
+ - in2
201
+ - en4
202
+ - uan3
203
+ - "\xFC2"
204
+ - ing3
205
+ - i
206
+ - ei2
207
+ - ei3
208
+ - iang1
209
+ - er2
210
+ - ia4
211
+ - uo2
212
+ - "\xFC3"
213
+ - uan1
214
+ - ia1
215
+ - e3
216
+ - ong4
217
+ - ie4
218
+ - ai1
219
+ - en3
220
+ - iang3
221
+ - eng4
222
+ - iang4
223
+ - ao1
224
+ - ou1
225
+ - ang2
226
+ - ai3
227
+ - iu4
228
+ - "\xFCan2"
229
+ - ang3
230
+ - en1
231
+ - ong2
232
+ - uei4
233
+ - ei4
234
+ - iao3
235
+ - "\xFC4"
236
+ - an2
237
+ - ing4
238
+ - an3
239
+ - a3
240
+ - ie3
241
+ - an1
242
+ - ian3
243
+ - uo1
244
+ - ing1
245
+ - ou4
246
+ - ian1
247
+ - ou3
248
+ - eng1
249
+ - ang1
250
+ - in1
251
+ - a4
252
+ - eng2
253
+ - uo4
254
+ - u1
255
+ - ang4
256
+ - iou3
257
+ - iao4
258
+ - ian2
259
+ - u2
260
+ - ui4
261
+ - e2
262
+ - en2
263
+ - u3
264
+ - ing2
265
+ - ao4
266
+ - ong1
267
+ - an4
268
+ - ai2
269
+ - ao3
270
+ - uo3
271
+ - ian4
272
+ - p
273
+ - c
274
+ - a1
275
+ - ai4
276
+ - e4
277
+ - s
278
+ - k
279
+ - r
280
+ - i2
281
+ - f
282
+ - n
283
+ - u4
284
+ - ch
285
+ - i3
286
+ - i1
287
+ - q
288
+ - z
289
+ - m
290
+ - t
291
+ - g
292
+ - b
293
+ - e
294
+ - h
295
+ - i4
296
+ - x
297
+ - "\uFF0C"
298
+ - zh
299
+ - "\u3002"
300
+ - l
301
+ - j
302
+ - sh
303
+ - d
304
+ - <sos/eos>
305
+ odim: null
306
+ model_conf: {}
307
+ use_preprocessor: true
308
+ token_type: phn
309
+ bpemodel: null
310
+ non_linguistic_symbols: null
311
+ cleaner: null
312
+ g2p: pypinyin_g2p_phone
313
+ feats_extract: fbank
314
+ feats_extract_conf:
315
+ fs: 24000
316
+ fmin: 80
317
+ fmax: 7600
318
+ n_mels: 80
319
+ hop_length: 300
320
+ n_fft: 2048
321
+ win_length: 1200
322
+ normalize: global_mvn
323
+ normalize_conf:
324
+ stats_file: exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/feats_stats.npz
325
+ tts: tacotron2
326
+ tts_conf:
327
+ embed_dim: 512
328
+ elayers: 1
329
+ eunits: 512
330
+ econv_layers: 3
331
+ econv_chans: 512
332
+ econv_filts: 5
333
+ atype: location
334
+ adim: 512
335
+ aconv_chans: 32
336
+ aconv_filts: 15
337
+ cumulate_att_w: true
338
+ dlayers: 2
339
+ dunits: 1024
340
+ prenet_layers: 2
341
+ prenet_units: 256
342
+ postnet_layers: 5
343
+ postnet_chans: 512
344
+ postnet_filts: 5
345
+ output_activation: null
346
+ use_batch_norm: true
347
+ use_concate: true
348
+ use_residual: false
349
+ dropout_rate: 0.5
350
+ zoneout_rate: 0.1
351
+ reduction_factor: 1
352
+ spk_embed_dim: null
353
+ use_masking: true
354
+ bce_pos_weight: 5.0
355
+ use_guided_attn_loss: true
356
+ guided_attn_loss_sigma: 0.4
357
+ guided_attn_loss_lambda: 1.0
358
+ required:
359
+ - output_dir
360
+ - token_list
361
+ distributed: false
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.8.0
2
+ files:
3
+ model_file: exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/199epoch.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1596252269.687669
6
+ torch: 1.5.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/config.yaml