Siddhant committed on
Commit
48bbe69
1 Parent(s): 3f051b8

import from zenodo

Browse files
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - vctk
9
+ license: cc-by-4.0
10
+ ---
11
+ ## Example ESPnet2 TTS model
12
+ ### `kan-bayashi/vctk_tts_train_gst_fastspeech_raw_phn_tacotron_g2p_en_no_space_train.loss.best`
13
+ ♻️ Imported from https://zenodo.org/record/3986241/
14
+
15
+ This model was trained by kan-bayashi using vctk/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
16
+ ### Demo: How to use in ESPnet2
17
+ ```python
18
+ # coming soon
19
+ ```
20
+ ### Citing ESPnet
21
+ ```bibtex
22
+ @inproceedings{watanabe2018espnet,
23
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
24
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
25
+ year={2018},
26
+ booktitle={Proceedings of Interspeech},
27
+ pages={2207--2211},
28
+ doi={10.21437/Interspeech.2018-1456},
29
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
30
+ }
31
+ @inproceedings{hayashi2020espnet,
32
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
33
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
34
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
35
+ pages={7654--7658},
36
+ year={2020},
37
+ organization={IEEE}
38
+ }
39
+ ```
40
+ or arXiv:
41
+ ```bibtex
42
+ @misc{watanabe2018espnet,
43
+ title={ESPnet: End-to-End Speech Processing Toolkit},
44
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
45
+ year={2018},
46
+ eprint={1804.00015},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
exp/tts_train_gst_fastspeech_raw_phn_tacotron_g2p_en_no_space/996epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbe21b7dc4d50f198233fdc46f47efba0b5f1ee432a5f34463dcf5eb2ac40a09
3
+ size 209755887
exp/tts_train_gst_fastspeech_raw_phn_tacotron_g2p_en_no_space/config.yaml ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_gst_fastspeech.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_gst_fastspeech_raw_phn_tacotron_g2p_en_no_space
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ cudnn_enabled: true
21
+ cudnn_benchmark: false
22
+ cudnn_deterministic: true
23
+ collect_stats: false
24
+ write_collected_feats: false
25
+ max_epoch: 1000
26
+ patience: null
27
+ val_scheduler_criterion:
28
+ - valid
29
+ - loss
30
+ early_stopping_criterion:
31
+ - valid
32
+ - loss
33
+ - min
34
+ best_model_criterion:
35
+ - - valid
36
+ - loss
37
+ - min
38
+ - - train
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ grad_clip: 1.0
43
+ grad_noise: false
44
+ accum_grad: 2
45
+ no_forward_run: false
46
+ resume: true
47
+ train_dtype: float32
48
+ log_interval: null
49
+ pretrain_path: []
50
+ pretrain_key: []
51
+ num_iters_per_epoch: null
52
+ batch_size: 20
53
+ valid_batch_size: null
54
+ batch_bins: 2400000
55
+ valid_batch_bins: null
56
+ train_shape_file:
57
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
58
+ - exp/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_use_att_constrainttrue_train.loss.best/tr_no_dev/speech_shape
59
+ valid_shape_file:
60
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
61
+ - exp/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_use_att_constrainttrue_train.loss.best/dev/speech_shape
62
+ batch_type: numel
63
+ valid_batch_type: null
64
+ fold_length:
65
+ - 150
66
+ - 800
67
+ sort_in_batch: descending
68
+ sort_batch: descending
69
+ multiple_iterator: false
70
+ chunk_length: 500
71
+ chunk_shift_ratio: 0.5
72
+ num_cache_chunks: 1024
73
+ train_data_path_and_name_and_type:
74
+ - - dump/raw/tr_no_dev/text
75
+ - text
76
+ - text
77
+ - - exp/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_use_att_constrainttrue_train.loss.best/tr_no_dev/denorm/feats.scp
78
+ - speech
79
+ - npy
80
+ - - exp/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_use_att_constrainttrue_train.loss.best/tr_no_dev/durations
81
+ - durations
82
+ - text_int
83
+ valid_data_path_and_name_and_type:
84
+ - - dump/raw/dev/text
85
+ - text
86
+ - text
87
+ - - exp/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_use_att_constrainttrue_train.loss.best/dev/denorm/feats.scp
88
+ - speech
89
+ - npy
90
+ - - exp/tts_train_gst_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_use_att_constrainttrue_train.loss.best/dev/durations
91
+ - durations
92
+ - text_int
93
+ allow_variable_data_keys: false
94
+ max_cache_size: 0.0
95
+ valid_max_cache_size: null
96
+ optim: adam
97
+ optim_conf:
98
+ lr: 1.0
99
+ scheduler: noamlr
100
+ scheduler_conf:
101
+ model_size: 384
102
+ warmup_steps: 4000
103
+ token_list:
104
+ - <blank>
105
+ - <unk>
106
+ - OY0
107
+ - ''''
108
+ - OY2
109
+ - ER2
110
+ - UH0
111
+ - '!'
112
+ - EY0
113
+ - AW0
114
+ - AA0
115
+ - UH2
116
+ - UW2
117
+ - AY0
118
+ - AO2
119
+ - AO0
120
+ - AE2
121
+ - AH2
122
+ - AE0
123
+ - AA2
124
+ - IY2
125
+ - EH0
126
+ - AW2
127
+ - ZH
128
+ - AY2
129
+ - OY1
130
+ - IH2
131
+ - UW0
132
+ - EY2
133
+ - EH2
134
+ - OW2
135
+ - OW0
136
+ - '?'
137
+ - CH
138
+ - ER1
139
+ - TH
140
+ - UH1
141
+ - AW1
142
+ - JH
143
+ - Y
144
+ - SH
145
+ - NG
146
+ - ','
147
+ - G
148
+ - OW1
149
+ - AO1
150
+ - IY0
151
+ - UW1
152
+ - EY1
153
+ - AY1
154
+ - HH
155
+ - F
156
+ - ER0
157
+ - V
158
+ - P
159
+ - B
160
+ - AH1
161
+ - IY1
162
+ - IH0
163
+ - AA1
164
+ - EH1
165
+ - AE1
166
+ - M
167
+ - W
168
+ - K
169
+ - DH
170
+ - Z
171
+ - .
172
+ - L
173
+ - D
174
+ - IH1
175
+ - R
176
+ - S
177
+ - N
178
+ - T
179
+ - AH0
180
+ - <sos/eos>
181
+ odim: 80
182
+ model_conf: {}
183
+ use_preprocessor: true
184
+ token_type: phn
185
+ bpemodel: null
186
+ non_linguistic_symbols: null
187
+ cleaner: tacotron
188
+ g2p: g2p_en_no_space
189
+ feats_extract: null
190
+ feats_extract_conf: null
191
+ normalize: global_mvn
192
+ normalize_conf:
193
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
194
+ tts: fastspeech
195
+ tts_conf:
196
+ adim: 384
197
+ aheads: 4
198
+ elayers: 6
199
+ eunits: 1536
200
+ dlayers: 6
201
+ dunits: 1536
202
+ positionwise_layer_type: conv1d
203
+ positionwise_conv_kernel_size: 3
204
+ duration_predictor_layers: 2
205
+ duration_predictor_chans: 384
206
+ duration_predictor_kernel_size: 3
207
+ postnet_layers: 5
208
+ postnet_filts: 5
209
+ postnet_chans: 256
210
+ use_masking: true
211
+ use_scaled_pos_enc: true
212
+ encoder_normalize_before: false
213
+ decoder_normalize_before: false
214
+ reduction_factor: 1
215
+ init_type: xavier_uniform
216
+ init_enc_alpha: 1.0
217
+ init_dec_alpha: 1.0
218
+ transformer_enc_dropout_rate: 0.1
219
+ transformer_enc_positional_dropout_rate: 0.1
220
+ transformer_enc_attn_dropout_rate: 0.1
221
+ transformer_dec_dropout_rate: 0.1
222
+ transformer_dec_positional_dropout_rate: 0.1
223
+ transformer_dec_attn_dropout_rate: 0.1
224
+ use_gst: true
225
+ gst_heads: 8
226
+ gst_tokens: 128
227
+ required:
228
+ - output_dir
229
+ - token_list
230
+ distributed: false
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
1
+ espnet: 0.8.0
2
+ files:
3
+ model_file: exp/tts_train_gst_fastspeech_raw_phn_tacotron_g2p_en_no_space/996epoch.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1597459367.34099
6
+ torch: 1.5.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_gst_fastspeech_raw_phn_tacotron_g2p_en_no_space/config.yaml