julien-c HF staff committed on
Commit
2d467a0
1 Parent(s): 32896b2

initial import from https://zenodo.org/record/3989498#.X90RlOlKjkM

Browse files
Files changed (4) hide show
  1. README.md +96 -0
  2. config.yaml +232 -0
  3. meta.yaml +8 -0
  4. pytorch_model.bin +3 -0
README.md ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - tts
6
+ language: en
7
+ datasets:
8
+ - ljspeech
9
+ license: cc-by-4.0
10
+ inference: false
11
+ ---
12
+
13
+ ## ESPnet2 model `kan-bayashi/ljspeech_tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space_train.loss.best`
14
+
15
+ ♻️ Imported from https://zenodo.org/record/3989498#.X90RlOlKjkM
16
+
17
+ This model was trained by kan-bayashi using the ljspeech/tts1 recipe in [ESPnet](https://github.com/espnet/espnet/).
18
+
19
+
20
+
21
+ ### Demo: How to use in ESPnet2
22
+
23
+ ```python
24
+ # coming soon
25
+ ```
26
+
27
+ ### Citing ESPnet
28
+
29
+ ```bibtex
30
+ @inproceedings{watanabe2018espnet,
31
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
32
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
33
+ year={2018},
34
+ booktitle={Proceedings of Interspeech},
35
+ pages={2207--2211},
36
+ doi={10.21437/Interspeech.2018-1456},
37
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
38
+ }
39
+ @inproceedings{hayashi2020espnet,
40
+ title={{ESPnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
41
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
42
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
43
+ pages={7654--7658},
44
+ year={2020},
45
+ organization={IEEE}
46
+ }
47
+ @inproceedings{inaguma-etal-2020-espnet,
48
+ title = "{ESP}net-{ST}: All-in-One Speech Translation Toolkit",
49
+ author = "Inaguma, Hirofumi and
50
+ Kiyono, Shun and
51
+ Duh, Kevin and
52
+ Karita, Shigeki and
53
+ Yalta, Nelson and
54
+ Hayashi, Tomoki and
55
+ Watanabe, Shinji",
56
+ booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
57
+ month = jul,
58
+ year = "2020",
59
+ address = "Online",
60
+ publisher = "Association for Computational Linguistics",
61
+ url = "https://www.aclweb.org/anthology/2020.acl-demos.34",
62
+ pages = "302--311",
63
+ }
64
+ ```
65
+
66
+
67
+ ### Training config
68
+
69
+ See full config in [`config.yaml`](./config.yaml)
70
+
71
+ ```yaml
72
+ config: conf/tuning/train_tacotron2.yaml
73
+ print_config: false
74
+ log_level: INFO
75
+ dry_run: false
76
+ iterator_type: sequence
77
+ output_dir: exp/tts_train_tacotron2_raw
78
+ ngpu: 1
79
+ seed: 0
80
+ num_workers: 1
81
+ num_att_plot: 3
82
+ dist_backend: nccl
83
+ dist_init_method: env://
84
+ dist_world_size: null
85
+ dist_rank: null
86
+ local_rank: 0
87
+ dist_master_addr: null
88
+ dist_master_port: null
89
+ dist_launcher: null
90
+ multiprocessing_distributed: false
91
+ cudnn_enabled: true
92
+ cudnn_benchmark: false
93
+ cudnn_deterministic: true
94
+ ```
95
+
96
+
config.yaml ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_tacotron2_raw
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ cudnn_enabled: true
21
+ cudnn_benchmark: false
22
+ cudnn_deterministic: true
23
+ collect_stats: false
24
+ write_collected_feats: false
25
+ max_epoch: 200
26
+ patience: null
27
+ val_scheduler_criterion:
28
+ - valid
29
+ - loss
30
+ early_stopping_criterion:
31
+ - valid
32
+ - loss
33
+ - min
34
+ best_model_criterion:
35
+ - - valid
36
+ - loss
37
+ - min
38
+ - - train
39
+ - loss
40
+ - min
41
+ keep_nbest_models: 5
42
+ grad_clip: 1.0
43
+ grad_noise: false
44
+ accum_grad: 1
45
+ no_forward_run: false
46
+ resume: true
47
+ train_dtype: float32
48
+ log_interval: null
49
+ pretrain_path: []
50
+ pretrain_key: []
51
+ num_iters_per_epoch: null
52
+ batch_size: 20
53
+ valid_batch_size: null
54
+ batch_bins: 5120000
55
+ valid_batch_bins: null
56
+ train_shape_file:
57
+ - exp/tts_stats_raw/train/text_shape.phn
58
+ - exp/tts_stats_raw/train/speech_shape
59
+ valid_shape_file:
60
+ - exp/tts_stats_raw/valid/text_shape.phn
61
+ - exp/tts_stats_raw/valid/speech_shape
62
+ batch_type: numel
63
+ valid_batch_type: null
64
+ fold_length:
65
+ - 150
66
+ - 204800
67
+ sort_in_batch: descending
68
+ sort_batch: descending
69
+ multiple_iterator: false
70
+ chunk_length: 500
71
+ chunk_shift_ratio: 0.5
72
+ num_cache_chunks: 1024
73
+ train_data_path_and_name_and_type:
74
+ - - dump/raw/tr_no_dev/text
75
+ - text
76
+ - text
77
+ - - dump/raw/tr_no_dev/wav.scp
78
+ - speech
79
+ - sound
80
+ valid_data_path_and_name_and_type:
81
+ - - dump/raw/dev/text
82
+ - text
83
+ - text
84
+ - - dump/raw/dev/wav.scp
85
+ - speech
86
+ - sound
87
+ allow_variable_data_keys: false
88
+ max_cache_size: 0.0
89
+ valid_max_cache_size: null
90
+ optim: adam
91
+ optim_conf:
92
+ lr: 0.001
93
+ eps: 1.0e-06
94
+ weight_decay: 0.0
95
+ scheduler: null
96
+ scheduler_conf: {}
97
+ token_list:
98
+ - <blank>
99
+ - <unk>
100
+ - ..
101
+ - OY0
102
+ - UH0
103
+ - AW0
104
+ - '!'
105
+ - OY2
106
+ - '?'
107
+ - UH2
108
+ - ER2
109
+ - ''''
110
+ - AA0
111
+ - IY2
112
+ - AW2
113
+ - AY0
114
+ - AH2
115
+ - UW2
116
+ - AE0
117
+ - OW2
118
+ - ZH
119
+ - AO2
120
+ - EY0
121
+ - OY1
122
+ - EH0
123
+ - UW0
124
+ - AA2
125
+ - AY2
126
+ - AE2
127
+ - IH2
128
+ - AO0
129
+ - EY2
130
+ - OW0
131
+ - EH2
132
+ - UH1
133
+ - TH
134
+ - AW1
135
+ - Y
136
+ - JH
137
+ - CH
138
+ - ER1
139
+ - G
140
+ - NG
141
+ - SH
142
+ - OW1
143
+ - .
144
+ - AY1
145
+ - EY1
146
+ - AO1
147
+ - IY0
148
+ - UW1
149
+ - IY1
150
+ - HH
151
+ - B
152
+ - AA1
153
+ - ','
154
+ - F
155
+ - ER0
156
+ - V
157
+ - AH1
158
+ - AE1
159
+ - P
160
+ - W
161
+ - EH1
162
+ - M
163
+ - IH0
164
+ - IH1
165
+ - Z
166
+ - K
167
+ - DH
168
+ - L
169
+ - R
170
+ - S
171
+ - D
172
+ - T
173
+ - N
174
+ - AH0
175
+ - <sos/eos>
176
+ odim: null
177
+ model_conf: {}
178
+ use_preprocessor: true
179
+ token_type: phn
180
+ bpemodel: null
181
+ non_linguistic_symbols: null
182
+ cleaner: tacotron
183
+ g2p: g2p_en_no_space
184
+ feats_extract: fbank
185
+ feats_extract_conf:
186
+ fs: 22050
187
+ fmin: 80
188
+ fmax: 7600
189
+ n_mels: 80
190
+ hop_length: 256
191
+ n_fft: 1024
192
+ win_length: null
193
+ normalize: global_mvn
194
+ normalize_conf:
195
+ stats_file: exp/tts_stats_raw_phn_tacotron_g2p_en_no_space/train/feats_stats.npz
196
+ tts: tacotron2
197
+ tts_conf:
198
+ embed_dim: 512
199
+ elayers: 1
200
+ eunits: 512
201
+ econv_layers: 3
202
+ econv_chans: 512
203
+ econv_filts: 5
204
+ atype: location
205
+ adim: 512
206
+ aconv_chans: 32
207
+ aconv_filts: 15
208
+ cumulate_att_w: true
209
+ dlayers: 2
210
+ dunits: 1024
211
+ prenet_layers: 2
212
+ prenet_units: 256
213
+ postnet_layers: 5
214
+ postnet_chans: 512
215
+ postnet_filts: 5
216
+ output_activation: null
217
+ use_batch_norm: true
218
+ use_concate: true
219
+ use_residual: false
220
+ dropout_rate: 0.5
221
+ zoneout_rate: 0.1
222
+ reduction_factor: 1
223
+ spk_embed_dim: null
224
+ use_masking: true
225
+ bce_pos_weight: 5.0
226
+ use_guided_attn_loss: true
227
+ guided_attn_loss_sigma: 0.4
228
+ guided_attn_loss_lambda: 1.0
229
+ required:
230
+ - output_dir
231
+ - token_list
232
+ distributed: false
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.8.0
2
+ files:
3
+ model_file: exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/199epoch.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1597749894.186349
6
+ torch: 1.6.0
7
+ yaml_files:
8
+ train_config: exp/tts_train_tacotron2_raw_phn_tacotron_g2p_en_no_space/config.yaml
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6fe4e34e9260b0970b20d6d5b27fa9355024713f4cc51ad14f87ee8d6518407
3
+ size 106846941