Siddhant committed on
Commit a08d777
1 Parent(s): a183e4d

import from zenodo

README.md ADDED
@@ -0,0 +1,50 @@
+ ---
+ tags:
+ - espnet
+ - audio
+ - text-to-speech
+ language: en
+ datasets:
+ - ljspeech
+ license: cc-by-4.0
+ ---
+ ## Example ESPnet2 TTS model
+ ### `kan-bayashi/ljspeech_tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space_train.loss.best`
+ ♻️ Imported from https://zenodo.org/record/3986231/
+
+ This model was trained by kan-bayashi using the ljspeech/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
+ ### Demo: How to use in ESPnet2
+ ```python
+ # coming soon
+ ```
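Since the upstream demo is still a placeholder, here is a minimal, hedged inference sketch. It assumes the `espnet_model_zoo` package and a reasonably recent `espnet` release; the exact call and return format may differ for the espnet 0.8.0 recorded in `meta.yaml` below.

```python
# Hedged sketch: fetch the packed model by its tag and run FastSpeech inference.
from espnet_model_zoo.downloader import ModelDownloader
from espnet2.bin.tts_inference import Text2Speech

d = ModelDownloader()
text2speech = Text2Speech(
    **d.download_and_unpack(
        "kan-bayashi/ljspeech_tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space_train.loss.best"
    )
)

# FastSpeech predicts a mel-spectrogram; a separate neural vocoder is needed
# to turn it into a waveform.
output = text2speech("Hello, this is a test run of the FastSpeech model.")
mel = output["feat_gen"]
```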
+ ### Citing ESPnet
+ ```bibtex
+ @inproceedings{watanabe2018espnet,
+   author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+   title={{ESPnet}: End-to-End Speech Processing Toolkit},
+   year={2018},
+   booktitle={Proceedings of Interspeech},
+   pages={2207--2211},
+   doi={10.21437/Interspeech.2018-1456},
+   url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
+ }
+ @inproceedings{hayashi2020espnet,
+   title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
+   author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
+   booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
+   pages={7654--7658},
+   year={2020},
+   organization={IEEE}
+ }
+ ```
+ or arXiv:
+ ```bibtex
+ @misc{watanabe2018espnet,
+   title={ESPnet: End-to-End Speech Processing Toolkit},
+   author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
+   year={2018},
+   eprint={1804.00015},
+   archivePrefix={arXiv},
+   primaryClass={cs.CL}
+ }
+ ```
exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz ADDED
Binary file (1.4 kB).
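These statistics feed the `global_mvn` normalization layer configured in `config.yaml` below (`normalize_conf.stats_file`). A quick way to inspect the archive, assuming only `numpy` is available:

```python
import numpy as np

# List the arrays stored in the feature-statistics archive and their shapes.
stats = np.load("exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz")
for name in stats.files:
    print(name, stats[name].shape, stats[name].dtype)
```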
exp/tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space/1000epoch.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1bb534f3b328a68a7d248ab9186861558bef5b5f82ff12d0135346b7902893c8
+ size 207038998
exp/tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space/config.yaml ADDED
@@ -0,0 +1,259 @@
+ config: conf/tuning/train_fastspeech.yaml
+ print_config: false
+ log_level: INFO
+ dry_run: false
+ iterator_type: sequence
+ output_dir: exp/tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space
+ ngpu: 1
+ seed: 0
+ num_workers: 1
+ num_att_plot: 3
+ dist_backend: nccl
+ dist_init_method: env://
+ dist_world_size: null
+ dist_rank: null
+ local_rank: 0
+ dist_master_addr: null
+ dist_master_port: null
+ dist_launcher: null
+ multiprocessing_distributed: false
+ cudnn_enabled: true
+ cudnn_benchmark: false
+ cudnn_deterministic: true
+ collect_stats: false
+ write_collected_feats: false
+ max_epoch: 1000
+ patience: null
+ val_scheduler_criterion:
+ - valid
+ - loss
+ early_stopping_criterion:
+ - valid
+ - loss
+ - min
+ best_model_criterion:
+ - - valid
+   - loss
+   - min
+ - - train
+   - loss
+   - min
+ keep_nbest_models: 5
+ grad_clip: 1.0
+ grad_noise: false
+ accum_grad: 6
+ no_forward_run: false
+ resume: true
+ train_dtype: float32
+ log_interval: null
+ pretrain_path: []
+ pretrain_key: []
+ num_iters_per_epoch: null
+ batch_size: 20
+ valid_batch_size: null
+ batch_bins: 800000
+ valid_batch_bins: null
+ train_shape_file:
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
+ - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_train.loss.best/tr_no_dev/speech_shape
+ valid_shape_file:
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
+ - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_train.loss.best/dev/speech_shape
+ batch_type: numel
+ valid_batch_type: null
+ fold_length:
+ - 150
+ - 800
+ sort_in_batch: descending
+ sort_batch: descending
+ multiple_iterator: false
+ chunk_length: 500
+ chunk_shift_ratio: 0.5
+ num_cache_chunks: 1024
+ train_data_path_and_name_and_type:
+ - - dump/raw/tr_no_dev/text
+   - text
+   - text
+ - - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_train.loss.best/tr_no_dev/durations
+   - durations
+   - text_int
+ - - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_train.loss.best/tr_no_dev/denorm/feats.scp
+   - speech
+   - npy
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/collect_feats/pitch.scp
+   - pitch
+   - npy
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/collect_feats/energy.scp
+   - energy
+   - npy
+ valid_data_path_and_name_and_type:
+ - - dump/raw/dev/text
+   - text
+   - text
+ - - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_train.loss.best/dev/durations
+   - durations
+   - text_int
+ - - exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/decode_train.loss.best/dev/denorm/feats.scp
+   - speech
+   - npy
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/collect_feats/pitch.scp
+   - pitch
+   - npy
+ - - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/collect_feats/energy.scp
+   - energy
+   - npy
+ allow_variable_data_keys: false
+ max_cache_size: 0.0
+ valid_max_cache_size: null
+ optim: adam
+ optim_conf:
+     lr: 1.0
+ scheduler: noamlr
+ scheduler_conf:
+     model_size: 384
+     warmup_steps: 4000
+ token_list:
+ - <blank>
+ - <unk>
+ - ..
+ - OY0
+ - UH0
+ - AW0
+ - '!'
+ - OY2
+ - '?'
+ - UH2
+ - ER2
+ - ''''
+ - AA0
+ - IY2
+ - AW2
+ - AY0
+ - AH2
+ - UW2
+ - AE0
+ - OW2
+ - ZH
+ - AO2
+ - EY0
+ - OY1
+ - EH0
+ - UW0
+ - AA2
+ - AY2
+ - AE2
+ - IH2
+ - AO0
+ - EY2
+ - OW0
+ - EH2
+ - UH1
+ - TH
+ - AW1
+ - Y
+ - JH
+ - CH
+ - ER1
+ - G
+ - NG
+ - SH
+ - OW1
+ - .
+ - AY1
+ - EY1
+ - AO1
+ - IY0
+ - UW1
+ - IY1
+ - HH
+ - B
+ - AA1
+ - ','
+ - F
+ - ER0
+ - V
+ - AH1
+ - AE1
+ - P
+ - W
+ - EH1
+ - M
+ - IH0
+ - IH1
+ - Z
+ - K
+ - DH
+ - L
+ - R
+ - S
+ - D
+ - T
+ - N
+ - AH0
+ - <sos/eos>
+ odim: 80
+ model_conf: {}
+ use_preprocessor: true
+ token_type: phn
+ bpemodel: null
+ non_linguistic_symbols: null
+ cleaner: tacotron
+ g2p: g2p_en_no_space
+ feats_extract: null
+ feats_extract_conf: null
+ normalize: global_mvn
+ normalize_conf:
+     stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
+ tts: fastspeech
+ tts_conf:
+     adim: 384
+     aheads: 2
+     elayers: 6
+     eunits: 1536
+     dlayers: 6
+     dunits: 1536
+     positionwise_layer_type: conv1d
+     positionwise_conv_kernel_size: 3
+     duration_predictor_layers: 2
+     duration_predictor_chans: 384
+     duration_predictor_kernel_size: 3
+     postnet_layers: 5
+     postnet_filts: 5
+     postnet_chans: 256
+     use_masking: true
+     use_scaled_pos_enc: true
+     encoder_normalize_before: false
+     decoder_normalize_before: false
+     reduction_factor: 1
+     init_type: xavier_uniform
+     init_enc_alpha: 1.0
+     init_dec_alpha: 1.0
+     transformer_enc_dropout_rate: 0.1
+     transformer_enc_positional_dropout_rate: 0.1
+     transformer_enc_attn_dropout_rate: 0.1
+     transformer_dec_dropout_rate: 0.1
+     transformer_dec_positional_dropout_rate: 0.1
+     transformer_dec_attn_dropout_rate: 0.1
+ pitch_extract: null
+ pitch_extract_conf:
+     fs: 22050
+     n_fft: 1024
+     hop_length: 256
+     f0max: 400
+     f0min: 80
+ pitch_normalize: null
+ pitch_normalize_conf:
+     stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/pitch_stats.npz
+ energy_extract: null
+ energy_extract_conf:
+     fs: 22050
+     n_fft: 1024
+     hop_length: 256
+     win_length: null
+ energy_normalize: null
+ energy_normalize_conf:
+     stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/energy_stats.npz
+ required:
+ - output_dir
+ - token_list
+ distributed: false
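One setting in this config that is easy to misread is `lr: 1.0` combined with `scheduler: noamlr`: the Noam schedule rescales the base rate by the model size and warmup, so the effective learning rate never comes close to 1.0. A hedged illustration of the standard Noam formula using the values above (`model_size: 384`, `warmup_steps: 4000`):

```python
# Standard Noam (inverse-square-root warmup) schedule, shown only to illustrate
# the optim/scheduler settings above; espnet's exact implementation may differ.
def noam_lr(step: int, base_lr: float = 1.0, model_size: int = 384, warmup_steps: int = 4000) -> float:
    return base_lr * model_size ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

for step in (100, 1000, 4000, 40000):
    print(f"step {step:>6}: lr = {noam_lr(step):.6f}")
# Ramps up linearly until step 4000 (peak around 8.1e-4), then decays as 1/sqrt(step).
```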
meta.yaml ADDED
@@ -0,0 +1,8 @@
+ espnet: 0.8.0
+ files:
+   model_file: exp/tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space/1000epoch.pth
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
+ timestamp: 1597459109.329321
+ torch: 1.5.1
+ yaml_files:
+   train_config: exp/tts_train_fastspeech_raw_phn_tacotron_g2p_en_no_space/config.yaml
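`meta.yaml` records which files in the archive are the trained weights and the training configuration. If the archive has already been unpacked locally, those two paths can, under the same assumptions as the sketch in the README above, be passed to `Text2Speech` directly instead of going through the downloader:

```python
import yaml
from espnet2.bin.tts_inference import Text2Speech

# Resolve the model file and training config from meta.yaml; the paths are
# relative to the root of the unpacked archive.
with open("meta.yaml") as f:
    meta = yaml.safe_load(f)

text2speech = Text2Speech(
    train_config=meta["yaml_files"]["train_config"],
    model_file=meta["files"]["model_file"],
)
```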