Siddhant committed on
Commit
1cb9d7a
1 Parent(s): 384b14c

import from zenodo

Browse files
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: en
7
+ datasets:
8
+ - ljspeech
9
+ license: cc-by-4.0
10
+ ---
11
+ ## Example ESPnet2 TTS model
12
+ ### `kan-bayashi/ljspeech_transformer`
13
+ ♻️ Imported from https://zenodo.org/record/4039194/
14
+
15
+ This model was trained by kan-bayashi using ljspeech/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
16
+ ### Demo: How to use in ESPnet2
17
+ ```python
18
+ # coming soon
19
+ ```
20
+ ### Citing ESPnet
21
+ ```BibTeX
22
+ @inproceedings{watanabe2018espnet,
23
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
24
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
25
+ year={2018},
26
+ booktitle={Proceedings of Interspeech},
27
+ pages={2207--2211},
28
+ doi={10.21437/Interspeech.2018-1456},
29
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
30
+ }
31
+ @inproceedings{hayashi2020espnet,
32
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
33
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
34
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
35
+ pages={7654--7658},
36
+ year={2020},
37
+ organization={IEEE}
38
+ }
39
+ ```
40
+ or arXiv:
41
+ ```bibtex
42
+ @misc{watanabe2018espnet,
43
+ title={ESPnet: End-to-End Speech Processing Toolkit},
44
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
45
+ year={2018},
46
+ eprint={1804.00015},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/config.yaml ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_transformer.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: 4
14
+ dist_rank: 0
15
+ local_rank: 0
16
+ dist_master_addr: localhost
17
+ dist_master_port: 53481
18
+ dist_launcher: null
19
+ multiprocessing_distributed: true
20
+ cudnn_enabled: true
21
+ cudnn_benchmark: false
22
+ cudnn_deterministic: true
23
+ collect_stats: false
24
+ write_collected_feats: false
25
+ max_epoch: 200
26
+ patience: null
27
+ val_scheduler_criterion:
28
+ - valid
29
+ - loss
30
+ early_stopping_criterion:
31
+ - valid
32
+ - loss
33
+ - min
34
+ best_model_criterion:
35
+ - - valid
36
+ - loss
37
+ - min
38
+ - - train
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ grad_clip: 1.0
43
+ grad_clip_type: 2.0
44
+ grad_noise: false
45
+ accum_grad: 2
46
+ no_forward_run: false
47
+ resume: true
48
+ train_dtype: float32
49
+ use_amp: false
50
+ log_interval: null
51
+ pretrain_path: []
52
+ pretrain_key: []
53
+ num_iters_per_epoch: 1000
54
+ batch_size: 20
55
+ valid_batch_size: null
56
+ batch_bins: 9000000
57
+ valid_batch_bins: null
58
+ train_shape_file:
59
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/text_shape.phn
60
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/speech_shape
61
+ valid_shape_file:
62
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/text_shape.phn
63
+ - exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/valid/speech_shape
64
+ batch_type: numel
65
+ valid_batch_type: null
66
+ fold_length:
67
+ - 150
68
+ - 204800
69
+ sort_in_batch: descending
70
+ sort_batch: descending
71
+ multiple_iterator: false
72
+ chunk_length: 500
73
+ chunk_shift_ratio: 0.5
74
+ num_cache_chunks: 1024
75
+ train_data_path_and_name_and_type:
76
+ - - dump/raw/tr_no_dev/text
77
+ - text
78
+ - text
79
+ - - dump/raw/tr_no_dev/wav.scp
80
+ - speech
81
+ - sound
82
+ valid_data_path_and_name_and_type:
83
+ - - dump/raw/dev/text
84
+ - text
85
+ - text
86
+ - - dump/raw/dev/wav.scp
87
+ - speech
88
+ - sound
89
+ allow_variable_data_keys: false
90
+ max_cache_size: 0.0
91
+ valid_max_cache_size: null
92
+ optim: adam
93
+ optim_conf:
94
+ lr: 1.0
95
+ scheduler: noamlr
96
+ scheduler_conf:
97
+ model_size: 512
98
+ warmup_steps: 8000
99
+ token_list:
100
+ - <blank>
101
+ - <unk>
102
+ - AH0
103
+ - N
104
+ - T
105
+ - D
106
+ - S
107
+ - R
108
+ - L
109
+ - DH
110
+ - K
111
+ - Z
112
+ - IH1
113
+ - IH0
114
+ - M
115
+ - EH1
116
+ - W
117
+ - P
118
+ - AE1
119
+ - AH1
120
+ - V
121
+ - ER0
122
+ - F
123
+ - ','
124
+ - AA1
125
+ - B
126
+ - HH
127
+ - IY1
128
+ - UW1
129
+ - IY0
130
+ - AO1
131
+ - EY1
132
+ - AY1
133
+ - .
134
+ - OW1
135
+ - SH
136
+ - NG
137
+ - G
138
+ - ER1
139
+ - CH
140
+ - JH
141
+ - Y
142
+ - AW1
143
+ - TH
144
+ - UH1
145
+ - EH2
146
+ - OW0
147
+ - EY2
148
+ - AO0
149
+ - IH2
150
+ - AE2
151
+ - AY2
152
+ - AA2
153
+ - UW0
154
+ - EH0
155
+ - OY1
156
+ - EY0
157
+ - AO2
158
+ - ZH
159
+ - OW2
160
+ - AE0
161
+ - UW2
162
+ - AH2
163
+ - AY0
164
+ - IY2
165
+ - AW2
166
+ - AA0
167
+ - ''''
168
+ - ER2
169
+ - UH2
170
+ - '?'
171
+ - OY2
172
+ - '!'
173
+ - AW0
174
+ - UH0
175
+ - OY0
176
+ - ..
177
+ - <sos/eos>
178
+ odim: null
179
+ model_conf: {}
180
+ use_preprocessor: true
181
+ token_type: phn
182
+ bpemodel: null
183
+ non_linguistic_symbols: null
184
+ cleaner: tacotron
185
+ g2p: g2p_en_no_space
186
+ feats_extract: fbank
187
+ feats_extract_conf:
188
+ fs: 22050
189
+ fmin: 80
190
+ fmax: 7600
191
+ n_mels: 80
192
+ hop_length: 256
193
+ n_fft: 1024
194
+ win_length: null
195
+ normalize: global_mvn
196
+ normalize_conf:
197
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
198
+ tts: transformer
199
+ tts_conf:
200
+ embed_dim: 0
201
+ eprenet_conv_layers: 0
202
+ eprenet_conv_filts: 0
203
+ eprenet_conv_chans: 0
204
+ dprenet_layers: 2
205
+ dprenet_units: 256
206
+ adim: 512
207
+ aheads: 8
208
+ elayers: 6
209
+ eunits: 1024
210
+ dlayers: 6
211
+ dunits: 1024
212
+ positionwise_layer_type: conv1d
213
+ positionwise_conv_kernel_size: 1
214
+ postnet_layers: 5
215
+ postnet_filts: 5
216
+ postnet_chans: 256
217
+ use_masking: true
218
+ bce_pos_weight: 5.0
219
+ use_scaled_pos_enc: true
220
+ encoder_normalize_before: true
221
+ decoder_normalize_before: true
222
+ reduction_factor: 1
223
+ init_type: xavier_uniform
224
+ init_enc_alpha: 1.0
225
+ init_dec_alpha: 1.0
226
+ eprenet_dropout_rate: 0.0
227
+ dprenet_dropout_rate: 0.5
228
+ postnet_dropout_rate: 0.5
229
+ transformer_enc_dropout_rate: 0.1
230
+ transformer_enc_positional_dropout_rate: 0.1
231
+ transformer_enc_attn_dropout_rate: 0.1
232
+ transformer_dec_dropout_rate: 0.1
233
+ transformer_dec_positional_dropout_rate: 0.1
234
+ transformer_dec_attn_dropout_rate: 0.1
235
+ transformer_enc_dec_attn_dropout_rate: 0.1
236
+ use_guided_attn_loss: true
237
+ num_heads_applied_guided_attn: 2
238
+ num_layers_applied_guided_attn: 2
239
+ modules_applied_guided_attn:
240
+ - encoder-decoder
241
+ guided_attn_loss_lambda: 10.0
242
+ pitch_extract: null
243
+ pitch_extract_conf: {}
244
+ pitch_normalize: null
245
+ pitch_normalize_conf: {}
246
+ energy_extract: null
247
+ energy_extract_conf: {}
248
+ energy_normalize: null
249
+ energy_normalize_conf: {}
250
+ required:
251
+ - output_dir
252
+ - token_list
253
+ distributed: true
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/backward_time.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/bce_loss.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/decoder_alpha.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/enc_dec_attn_loss.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/encoder_alpha.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/forward_time.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/iter_time.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/l1_loss.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/l2_loss.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/loss.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/lr_0.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/optim_step_time.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/images/train_time.png ADDED
exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61fb5c7180013afa60526105e2650e9eb5ae41c7d3336f9e3b1e01179c31417a
3
+ size 132380423
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.8.0
2
+ files:
3
+ model_file: exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/train.loss.ave_5best.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1600568352.208606
6
+ torch: 1.5.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_transformer_raw_phn_tacotron_g2p_en_no_space/config.yaml