Siddhant committed
Commit 0305952
1 Parent(s): 064e7b9

import from zenodo

README.md ADDED
@@ -0,0 +1,50 @@
+ ---
+ tags:
+ - espnet
+ - audio
+ - text-to-speech
+ language: zh
+ datasets:
+ - csmsc
+ license: cc-by-4.0
+ ---
+ ## Example ESPnet2 TTS model
+ ### `kan-bayashi/csmsc_fastspeech`
+ ♻️ Imported from https://zenodo.org/record/3986227/
+
+ This model was trained by kan-bayashi using the csmsc/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
+ ### Demo: How to use in ESPnet2
+ ```python
+ # coming soon
+ ```
+ ### Citing ESPnet
+ ```bibtex
+ @inproceedings{watanabe2018espnet,
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
+ year={2018},
+ booktitle={Proceedings of Interspeech},
+ pages={2207--2211},
+ doi={10.21437/Interspeech.2018-1456},
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
+ }
+ @inproceedings{hayashi2020espnet,
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+ pages={7654--7658},
+ year={2020},
+ organization={IEEE}
+ }
+ ```
+ or arXiv:
+ ```bibtex
+ @misc{watanabe2018espnet,
+ title={ESPnet: End-to-End Speech Processing Toolkit},
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+ year={2018},
+ eprint={1804.00015},
+ archivePrefix={arXiv},
+ primaryClass={cs.CL}
+ }
+ ```
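The Demo section above is still a `# coming soon` stub. Until an official snippet lands, here is a minimal, untested sketch of inference with the files added in this commit; it assumes a recent ESPnet2 install (`espnet2.bin.tts_inference.Text2Speech`), and the output-key names may differ in older releases.

```python
# Hedged sketch: synthesize a mel-spectrogram from the checkpoint and config
# shipped in this repository (paths taken from meta.yaml below).
from espnet2.bin.tts_inference import Text2Speech

tts = Text2Speech(
    train_config="exp/tts_train_fastspeech_raw_phn_pypinyin_g2p_phone/config.yaml",
    model_file="exp/tts_train_fastspeech_raw_phn_pypinyin_g2p_phone/999epoch.pth",
)

out = tts("你好，欢迎使用语音合成。")  # raw Mandarin text; pypinyin G2P runs internally
mel = out["feat_gen"]  # (T, 80) mel-spectrogram; odim is 80 in config.yaml
# FastSpeech alone does not emit audio: pass `mel` to a vocoder trained on
# CSMSC at 24 kHz (e.g. Griffin-Lim or a neural vocoder) to get a waveform.
```

The two paths used here are the same ones listed in meta.yaml, so the sketch stays consistent with what the Zenodo import actually ships.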
exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/feats_stats.npz ADDED
Binary file (1.4 kB).
exp/tts_train_fastspeech_raw_phn_pypinyin_g2p_phone/999epoch.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:625952bf0d0051b603bd36c0944f8e803e6be6de7333694a9ae730ff1fb08484
+ size 207237144
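The checkpoint itself is stored with Git LFS, so a checkout without LFS support only yields the three-line pointer above (sha256 plus a size of about 207 MB). A hypothetical way to materialize the real file from Python is `huggingface_hub.hf_hub_download`; the `repo_id` below is a placeholder, not a value taken from this commit, and a plain `git lfs pull` achieves the same thing.

```python
# Hypothetical fetch of the LFS-backed checkpoint; REPO_ID is a placeholder.
from huggingface_hub import hf_hub_download

REPO_ID = "your-namespace/csmsc_fastspeech"  # placeholder repository id
ckpt_path = hf_hub_download(
    repo_id=REPO_ID,
    filename="exp/tts_train_fastspeech_raw_phn_pypinyin_g2p_phone/999epoch.pth",
)
print(ckpt_path)  # local cache path to the ~207 MB 999epoch.pth
```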
exp/tts_train_fastspeech_raw_phn_pypinyin_g2p_phone/config.yaml ADDED
@@ -0,0 +1,388 @@
+ config: conf/tuning/train_fastspeech.yaml
+ print_config: false
+ log_level: INFO
+ dry_run: false
+ iterator_type: sequence
+ output_dir: exp/tts_train_fastspeech_raw_phn_pypinyin_g2p_phone
+ ngpu: 1
+ seed: 0
+ num_workers: 1
+ num_att_plot: 3
+ dist_backend: nccl
+ dist_init_method: env://
+ dist_world_size: null
+ dist_rank: null
+ local_rank: 0
+ dist_master_addr: null
+ dist_master_port: null
+ dist_launcher: null
+ multiprocessing_distributed: false
+ cudnn_enabled: true
+ cudnn_benchmark: false
+ cudnn_deterministic: true
+ collect_stats: false
+ write_collected_feats: false
+ max_epoch: 1000
+ patience: null
+ val_scheduler_criterion:
+ - valid
+ - loss
+ early_stopping_criterion:
+ - valid
+ - loss
+ - min
+ best_model_criterion:
+ - - valid
+   - loss
+   - min
+ - - train
+   - loss
+   - min
+ keep_nbest_models: 5
+ grad_clip: 1.0
+ grad_noise: false
+ accum_grad: 6
+ no_forward_run: false
+ resume: true
+ train_dtype: float32
+ log_interval: null
+ pretrain_path: []
+ pretrain_key: []
+ num_iters_per_epoch: null
+ batch_size: 20
+ valid_batch_size: null
+ batch_bins: 800000
+ valid_batch_bins: null
+ train_shape_file:
+ - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/text_shape.phn
+ - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/tr_no_dev/speech_shape
+ valid_shape_file:
+ - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/text_shape.phn
+ - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/dev/speech_shape
+ batch_type: numel
+ valid_batch_type: null
+ fold_length:
+ - 150
+ - 800
+ sort_in_batch: descending
+ sort_batch: descending
+ multiple_iterator: false
+ chunk_length: 500
+ chunk_shift_ratio: 0.5
+ num_cache_chunks: 1024
+ train_data_path_and_name_and_type:
+ - - dump/raw/tr_no_dev/text
+   - text
+   - text
+ - - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/tr_no_dev/durations
+   - durations
+   - text_int
+ - - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/tr_no_dev/denorm/feats.scp
+   - speech
+   - npy
+ - - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/collect_feats/pitch.scp
+   - pitch
+   - npy
+ - - exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/collect_feats/energy.scp
+   - energy
+   - npy
+ valid_data_path_and_name_and_type:
+ - - dump/raw/dev/text
+   - text
+   - text
+ - - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/dev/durations
+   - durations
+   - text_int
+ - - exp/tts_train_tacotron2_raw_phn_pypinyin_g2p_phone/decode_train.loss.best/dev/denorm/feats.scp
+   - speech
+   - npy
+ - - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/collect_feats/pitch.scp
+   - pitch
+   - npy
+ - - exp/tts_stats_raw_phn_pypinyin_g2p_phone/valid/collect_feats/energy.scp
+   - energy
+   - npy
+ allow_variable_data_keys: false
+ max_cache_size: 0.0
+ valid_max_cache_size: null
+ optim: adam
+ optim_conf:
+   lr: 1.0
+ scheduler: noamlr
+ scheduler_conf:
+   model_size: 384
+   warmup_steps: 4000
+ token_list:
+ - <blank>
+ - <unk>
+ - "\uFF30"
+ - "\uFF22"
+ - "\xFC"
+ - an
+ - ueng3
+ - '2'
+ - uen
+ - ei
+ - ua
+ - ao
+ - u
+ - ueng4
+ - uo
+ - ang
+ - ou
+ - v2
+ - ueng1
+ - o
+ - io1
+ - "\xFCn3"
+ - er
+ - ve4
+ - o3
+ - uai2
+ - uen3
+ - uen1
+ - uai3
+ - "\xFCe3"
+ - iou1
+ - iong2
+ - ia2
+ - uai1
+ - iong1
+ - "\xFCan1"
+ - "\xFCe1"
+ - v4
+ - ua3
+ - ia
+ - iong3
+ - uei3
+ - ua2
+ - ia3
+ - uei1
+ - o1
+ - o4
+ - "\xFCn2"
+ - un2
+ - er3
+ - "\xFCn1"
+ - uen4
+ - un3
+ - iu1
+ - "\xFCn4"
+ - uen2
+ - "\xFCan3"
+ - un4
+ - "\xFCan4"
+ - iu3
+ - ua1
+ - uei2
+ - "\uFF01"
+ - iou4
+ - iou2
+ - er4
+ - o2
+ - ei1
+ - iao2
+ - uang4
+ - "\xFC1"
+ - ui2
+ - v3
+ - uang2
+ - iong4
+ - un1
+ - ui1
+ - ua4
+ - ao2
+ - en
+ - a
+ - iu2
+ - uang1
+ - uang3
+ - "\xFCe2"
+ - in3
+ - "\uFF1F"
+ - uai4
+ - "\xFCe4"
+ - uan2
+ - ou2
+ - eng3
+ - ui3
+ - uan4
+ - a2
+ - ie2
+ - ong3
+ - iang2
+ - ie1
+ - in4
+ - iao1
+ - e1
+ - in2
+ - en4
+ - uan3
+ - "\xFC2"
+ - ing3
+ - i
+ - ei2
+ - ei3
+ - iang1
+ - er2
+ - ia4
+ - uo2
+ - "\xFC3"
+ - uan1
+ - ia1
+ - e3
+ - ong4
+ - ie4
+ - ai1
+ - en3
+ - iang3
+ - eng4
+ - iang4
+ - ao1
+ - ou1
+ - ang2
+ - ai3
+ - iu4
+ - "\xFCan2"
+ - ang3
+ - en1
+ - ong2
+ - uei4
+ - ei4
+ - iao3
+ - "\xFC4"
+ - an2
+ - ing4
+ - an3
+ - a3
+ - ie3
+ - an1
+ - ian3
+ - uo1
+ - ing1
+ - ou4
+ - ian1
+ - ou3
+ - eng1
+ - ang1
+ - in1
+ - a4
+ - eng2
+ - uo4
+ - u1
+ - ang4
+ - iou3
+ - iao4
+ - ian2
+ - u2
+ - ui4
+ - e2
+ - en2
+ - u3
+ - ing2
+ - ao4
+ - ong1
+ - an4
+ - ai2
+ - ao3
+ - uo3
+ - ian4
+ - p
+ - c
+ - a1
+ - ai4
+ - e4
+ - s
+ - k
+ - r
+ - i2
+ - f
+ - n
+ - u4
+ - ch
+ - i3
+ - i1
+ - q
+ - z
+ - m
+ - t
+ - g
+ - b
+ - e
+ - h
+ - i4
+ - x
+ - "\uFF0C"
+ - zh
+ - "\u3002"
+ - l
+ - j
+ - sh
+ - d
+ - <sos/eos>
+ odim: 80
+ model_conf: {}
+ use_preprocessor: true
+ token_type: phn
+ bpemodel: null
+ non_linguistic_symbols: null
+ cleaner: null
+ g2p: pypinyin_g2p_phone
+ feats_extract: null
+ feats_extract_conf: null
+ normalize: global_mvn
+ normalize_conf:
+   stats_file: exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/feats_stats.npz
+ tts: fastspeech
+ tts_conf:
+   adim: 384
+   aheads: 2
+   elayers: 6
+   eunits: 1536
+   dlayers: 6
+   dunits: 1536
+   positionwise_layer_type: conv1d
+   positionwise_conv_kernel_size: 3
+   duration_predictor_layers: 2
+   duration_predictor_chans: 384
+   duration_predictor_kernel_size: 3
+   postnet_layers: 5
+   postnet_filts: 5
+   postnet_chans: 256
+   use_masking: true
+   use_scaled_pos_enc: true
+   encoder_normalize_before: false
+   decoder_normalize_before: false
+   reduction_factor: 1
+   init_type: xavier_uniform
+   init_enc_alpha: 1.0
+   init_dec_alpha: 1.0
+   transformer_enc_dropout_rate: 0.1
+   transformer_enc_positional_dropout_rate: 0.1
+   transformer_enc_attn_dropout_rate: 0.1
+   transformer_dec_dropout_rate: 0.1
+   transformer_dec_positional_dropout_rate: 0.1
+   transformer_dec_attn_dropout_rate: 0.1
+ pitch_extract: null
+ pitch_extract_conf:
+   fs: 24000
+   n_fft: 2048
+   hop_length: 300
+   f0max: 400
+   f0min: 80
+ pitch_normalize: null
+ pitch_normalize_conf:
+   stats_file: exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/pitch_stats.npz
+ energy_extract: null
+ energy_extract_conf:
+   fs: 24000
+   n_fft: 2048
+   hop_length: 300
+   win_length: 1200
+ energy_normalize: null
+ energy_normalize_conf:
+   stats_file: exp/tts_stats_raw_phn_pypinyin_g2p_phone/train/energy_stats.npz
+ required:
+ - output_dir
+ - token_list
+ distributed: false
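Two entries in this config determine the text front-end: `token_type: phn` together with `g2p: pypinyin_g2p_phone`, i.e. Mandarin text is split into pinyin initials and tone-numbered finals, which is exactly the inventory listed under `token_list`. A small sketch for inspecting that mapping, assuming an ESPnet installation with `pypinyin` available (the printed tokens are illustrative):

```python
# Sketch: run the pypinyin-based phoneme front-end selected by this config.
from espnet2.text.phoneme_tokenizer import PhonemeTokenizer

tokenizer = PhonemeTokenizer(g2p_type="pypinyin_g2p_phone")
print(tokenizer.text2tokens("你好"))  # expected along the lines of ['n', 'i3', 'h', 'ao3']
```

Every token the front-end can emit has to appear in `token_list`, which is why the list above mixes bare initials (`zh`, `sh`, ...), tone-numbered finals (`ao3`, `ian4`, ...), and punctuation symbols.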
meta.yaml ADDED
@@ -0,0 +1,8 @@
+ espnet: 0.8.0
+ files:
+   model_file: exp/tts_train_fastspeech_raw_phn_pypinyin_g2p_phone/999epoch.pth
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
+ timestamp: 1597458509.935923
+ torch: 1.5.1
+ yaml_files:
+   train_config: exp/tts_train_fastspeech_raw_phn_pypinyin_g2p_phone/config.yaml