Siddhant committed on
Commit
2979580
1 Parent(s): ae4fc1a

import from zenodo

Browse files
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - ljspeech
9
+ license: cc-by-4.0
10
+ ---
11
+ ## Example ESPnet2 TTS model
12
+ ### `kan-bayashi/ljspeech_tacotron2`
13
+ ♻️ Imported from https://zenodo.org/record/3989498/
14
+
15
+ This model was trained by kan-bayashi using the ljspeech/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
16
+ ### Demo: How to use in ESPnet2
17
+ ```python
18
+ # coming soon
19
+ ```
20
+ ### Citing ESPnet
21
+ ```bibtex
22
+ @inproceedings{watanabe2018espnet,
23
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
24
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
25
+ year={2018},
26
+ booktitle={Proceedings of Interspeech},
27
+ pages={2207--2211},
28
+ doi={10.21437/Interspeech.2018-1456},
29
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
30
+ }
31
+ @inproceedings{hayashi2020espnet,
32
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
33
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
34
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
35
+ pages={7654--7658},
36
+ year={2020},
37
+ organization={IEEE}
38
+ }
39
+ ```
40
+ or arXiv:
41
+ ```bibtex
42
+ @misc{watanabe2018espnet,
43
+ title={ESPnet: End-to-End Speech Processing Toolkit},
44
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
45
+ year={2018},
46
+ eprint={1804.00015},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/199epoch.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6fe4e34e9260b0970b20d6d5b27fa9355024713f4cc51ad14f87ee8d6518407
3
+ size 106846941
exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/config.yaml ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_tacotron2_raw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ cudnn_enabled: true
21
+ cudnn_benchmark: false
22
+ cudnn_deterministic: true
23
+ collect_stats: false
24
+ write_collected_feats: false
25
+ max_epoch: 200
26
+ patience: null
27
+ val_scheduler_criterion:
28
+ - valid
29
+ - loss
30
+ early_stopping_criterion:
31
+ - valid
32
+ - loss
33
+ - min
34
+ best_model_criterion:
35
+ - - valid
36
+ - loss
37
+ - min
38
+ - - train
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ grad_clip: 1.0
43
+ grad_noise: false
44
+ accum_grad: 1
45
+ no_forward_run: false
46
+ resume: true
47
+ train_dtype: float32
48
+ log_interval: null
49
+ pretrain_path: []
50
+ pretrain_key: []
51
+ num_iters_per_epoch: null
52
+ batch_size: 20
53
+ valid_batch_size: null
54
+ batch_bins: 5120000
55
+ valid_batch_bins: null
56
+ train_shape_file:
57
+ - exp/tts_stats_raw/train/text_shape.phn
58
+ - exp/tts_stats_raw/train/speech_shape
59
+ valid_shape_file:
60
+ - exp/tts_stats_raw/valid/text_shape.phn
61
+ - exp/tts_stats_raw/valid/speech_shape
62
+ batch_type: numel
63
+ valid_batch_type: null
64
+ fold_length:
65
+ - 150
66
+ - 204800
67
+ sort_in_batch: descending
68
+ sort_batch: descending
69
+ multiple_iterator: false
70
+ chunk_length: 500
71
+ chunk_shift_ratio: 0.5
72
+ num_cache_chunks: 1024
73
+ train_data_path_and_name_and_type:
74
+ - - dump/raw/tr_no_dev/text
75
+ - text
76
+ - text
77
+ - - dump/raw/tr_no_dev/wav.scp
78
+ - speech
79
+ - sound
80
+ valid_data_path_and_name_and_type:
81
+ - - dump/raw/dev/text
82
+ - text
83
+ - text
84
+ - - dump/raw/dev/wav.scp
85
+ - speech
86
+ - sound
87
+ allow_variable_data_keys: false
88
+ max_cache_size: 0.0
89
+ valid_max_cache_size: null
90
+ optim: adam
91
+ optim_conf:
92
+ lr: 0.001
93
+ eps: 1.0e-06
94
+ weight_decay: 0.0
95
+ scheduler: null
96
+ scheduler_conf: {}
97
+ token_list:
98
+ - <blank>
99
+ - <unk>
100
+ - ..
101
+ - OY0
102
+ - UH0
103
+ - AW0
104
+ - '!'
105
+ - OY2
106
+ - '?'
107
+ - UH2
108
+ - ER2
109
+ - ''''
110
+ - AA0
111
+ - IY2
112
+ - AW2
113
+ - AY0
114
+ - AH2
115
+ - UW2
116
+ - AE0
117
+ - OW2
118
+ - ZH
119
+ - AO2
120
+ - EY0
121
+ - OY1
122
+ - EH0
123
+ - UW0
124
+ - AA2
125
+ - AY2
126
+ - AE2
127
+ - IH2
128
+ - AO0
129
+ - EY2
130
+ - OW0
131
+ - EH2
132
+ - UH1
133
+ - TH
134
+ - AW1
135
+ - Y
136
+ - JH
137
+ - CH
138
+ - ER1
139
+ - G
140
+ - NG
141
+ - SH
142
+ - OW1
143
+ - .
144
+ - AY1
145
+ - EY1
146
+ - AO1
147
+ - IY0
148
+ - UW1
149
+ - IY1
150
+ - HH
151
+ - B
152
+ - AA1
153
+ - ','
154
+ - F
155
+ - ER0
156
+ - V
157
+ - AH1
158
+ - AE1
159
+ - P
160
+ - W
161
+ - EH1
162
+ - M
163
+ - IH0
164
+ - IH1
165
+ - Z
166
+ - K
167
+ - DH
168
+ - L
169
+ - R
170
+ - S
171
+ - D
172
+ - T
173
+ - N
174
+ - AH0
175
+ - <sos/eos>
176
+ odim: null
177
+ model_conf: {}
178
+ use_preprocessor: true
179
+ token_type: phn
180
+ bpemodel: null
181
+ non_linguistic_symbols: null
182
+ cleaner: tacotron
183
+ g2p: g2p_en_no_space
184
+ feats_extract: fbank
185
+ feats_extract_conf:
186
+ fs: 22050
187
+ fmin: 80
188
+ fmax: 7600
189
+ n_mels: 80
190
+ hop_length: 256
191
+ n_fft: 1024
192
+ win_length: null
193
+ normalize: global_mvn
194
+ normalize_conf:
195
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
196
+ tts: tacotron2
197
+ tts_conf:
198
+ embed_dim: 512
199
+ elayers: 1
200
+ eunits: 512
201
+ econv_layers: 3
202
+ econv_chans: 512
203
+ econv_filts: 5
204
+ atype: location
205
+ adim: 512
206
+ aconv_chans: 32
207
+ aconv_filts: 15
208
+ cumulate_att_w: true
209
+ dlayers: 2
210
+ dunits: 1024
211
+ prenet_layers: 2
212
+ prenet_units: 256
213
+ postnet_layers: 5
214
+ postnet_chans: 512
215
+ postnet_filts: 5
216
+ output_activation: null
217
+ use_batch_norm: true
218
+ use_concate: true
219
+ use_residual: false
220
+ dropout_rate: 0.5
221
+ zoneout_rate: 0.1
222
+ reduction_factor: 1
223
+ spk_embed_dim: null
224
+ use_masking: true
225
+ bce_pos_weight: 5.0
226
+ use_guided_attn_loss: true
227
+ guided_attn_loss_sigma: 0.4
228
+ guided_attn_loss_lambda: 1.0
229
+ required:
230
+ - output_dir
231
+ - token_list
232
+ distributed: false
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
1
+ espnet: 0.8.0
2
+ files:
3
+ model_file: exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/199epoch.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1597749894.186349
6
+ torch: 1.6.0
7
+ yaml_files:
8
+ train_config: exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/config.yaml