Siddhant committed on
Commit
ca852c5
1 Parent(s): e9c1439

import from zenodo

Browse files
README.md ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ tags:
3
+ - espnet
4
+ - audio
5
+ - text-to-speech
6
+ language: ja
7
+ datasets:
8
+ - jsut
9
+ license: cc-by-4.0
10
+ ---
11
+ ## ESPnet2 TTS pretrained model
12
+ ### `kan-bayashi/jsut_tacotron2_prosody`
13
+ ♻️ Imported from https://zenodo.org/record/5499026/
14
+
15
+ This model was trained by kan-bayashi using jsut/tts1 recipe in [espnet](https://github.com/espnet/espnet/).
16
+ ### Demo: How to use in ESPnet2
17
+ ```python
18
+ # coming soon
19
+ ```
20
+ ### Citing ESPnet
21
+ ```bibtex
22
+ @inproceedings{watanabe2018espnet,
23
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson {Enrique Yalta Soplin} and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
24
+ title={{ESPnet}: End-to-End Speech Processing Toolkit},
25
+ year={2018},
26
+ booktitle={Proceedings of Interspeech},
27
+ pages={2207--2211},
28
+ doi={10.21437/Interspeech.2018-1456},
29
+ url={http://dx.doi.org/10.21437/Interspeech.2018-1456}
30
+ }
31
+ @inproceedings{hayashi2020espnet,
32
+ title={{Espnet-TTS}: Unified, reproducible, and integratable open source end-to-end text-to-speech toolkit},
33
+ author={Hayashi, Tomoki and Yamamoto, Ryuichi and Inoue, Katsuki and Yoshimura, Takenori and Watanabe, Shinji and Toda, Tomoki and Takeda, Kazuya and Zhang, Yu and Tan, Xu},
34
+ booktitle={Proceedings of IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
35
+ pages={7654--7658},
36
+ year={2020},
37
+ organization={IEEE}
38
+ }
39
+ ```
40
+ or arXiv:
41
+ ```bibtex
42
+ @misc{watanabe2018espnet,
43
+ title={ESPnet: End-to-End Speech Processing Toolkit},
44
+ author={Shinji Watanabe and Takaaki Hori and Shigeki Karita and Tomoki Hayashi and Jiro Nishitoba and Yuya Unno and Nelson Enrique Yalta Soplin and Jahn Heymann and Matthew Wiesner and Nanxin Chen and Adithya Renduchintala and Tsubasa Ochiai},
45
+ year={2018},
46
+ eprint={1804.00015},
47
+ archivePrefix={arXiv},
48
+ primaryClass={cs.CL}
49
+ }
50
+ ```
exp/tts_stats_raw_phn_jaconv_pyopenjtalk_prosody/train/feats_stats.npz ADDED
Binary file (1.4 kB). View file
 
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/config.yaml ADDED
@@ -0,0 +1,225 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ config: conf/tuning/train_tacotron2.yaml
2
+ print_config: false
3
+ log_level: INFO
4
+ dry_run: false
5
+ iterator_type: sequence
6
+ output_dir: exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody
7
+ ngpu: 1
8
+ seed: 0
9
+ num_workers: 1
10
+ num_att_plot: 3
11
+ dist_backend: nccl
12
+ dist_init_method: env://
13
+ dist_world_size: null
14
+ dist_rank: null
15
+ local_rank: 0
16
+ dist_master_addr: null
17
+ dist_master_port: null
18
+ dist_launcher: null
19
+ multiprocessing_distributed: false
20
+ unused_parameters: false
21
+ sharded_ddp: false
22
+ cudnn_enabled: true
23
+ cudnn_benchmark: false
24
+ cudnn_deterministic: true
25
+ collect_stats: false
26
+ write_collected_feats: false
27
+ max_epoch: 200
28
+ patience: null
29
+ val_scheduler_criterion:
30
+ - valid
31
+ - loss
32
+ early_stopping_criterion:
33
+ - valid
34
+ - loss
35
+ - min
36
+ best_model_criterion:
37
+ - - valid
38
+ - loss
39
+ - min
40
+ - - train
41
+ - loss
42
+ - min
43
+ keep_nbest_models: 5
44
+ grad_clip: 1.0
45
+ grad_clip_type: 2.0
46
+ grad_noise: false
47
+ accum_grad: 1
48
+ no_forward_run: false
49
+ resume: true
50
+ train_dtype: float32
51
+ use_amp: false
52
+ log_interval: null
53
+ use_tensorboard: true
54
+ use_wandb: false
55
+ wandb_project: null
56
+ wandb_id: null
57
+ wandb_entity: null
58
+ wandb_name: null
59
+ wandb_model_log_interval: -1
60
+ detect_anomaly: false
61
+ pretrain_path: null
62
+ init_param: []
63
+ ignore_init_mismatch: false
64
+ freeze_param: []
65
+ num_iters_per_epoch: 500
66
+ batch_size: 20
67
+ valid_batch_size: null
68
+ batch_bins: 3750000
69
+ valid_batch_bins: null
70
+ train_shape_file:
71
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk_prosody/train/text_shape.phn
72
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk_prosody/train/speech_shape
73
+ valid_shape_file:
74
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk_prosody/valid/text_shape.phn
75
+ - exp/tts_stats_raw_phn_jaconv_pyopenjtalk_prosody/valid/speech_shape
76
+ batch_type: numel
77
+ valid_batch_type: null
78
+ fold_length:
79
+ - 150
80
+ - 240000
81
+ sort_in_batch: descending
82
+ sort_batch: descending
83
+ multiple_iterator: false
84
+ chunk_length: 500
85
+ chunk_shift_ratio: 0.5
86
+ num_cache_chunks: 1024
87
+ train_data_path_and_name_and_type:
88
+ - - dump/raw/tr_no_dev/text
89
+ - text
90
+ - text
91
+ - - dump/raw/tr_no_dev/wav.scp
92
+ - speech
93
+ - sound
94
+ valid_data_path_and_name_and_type:
95
+ - - dump/raw/dev/text
96
+ - text
97
+ - text
98
+ - - dump/raw/dev/wav.scp
99
+ - speech
100
+ - sound
101
+ allow_variable_data_keys: false
102
+ max_cache_size: 0.0
103
+ max_cache_fd: 32
104
+ valid_max_cache_size: null
105
+ optim: adam
106
+ optim_conf:
107
+ lr: 0.001
108
+ eps: 1.0e-06
109
+ weight_decay: 0.0
110
+ scheduler: null
111
+ scheduler_conf: {}
112
+ token_list:
113
+ - <blank>
114
+ - <unk>
115
+ - a
116
+ - o
117
+ - i
118
+ - '['
119
+ - '#'
120
+ - u
121
+ - ']'
122
+ - e
123
+ - k
124
+ - n
125
+ - t
126
+ - r
127
+ - s
128
+ - N
129
+ - m
130
+ - _
131
+ - sh
132
+ - d
133
+ - g
134
+ - ^
135
+ - $
136
+ - w
137
+ - cl
138
+ - h
139
+ - y
140
+ - b
141
+ - j
142
+ - ts
143
+ - ch
144
+ - z
145
+ - p
146
+ - f
147
+ - ky
148
+ - ry
149
+ - gy
150
+ - hy
151
+ - ny
152
+ - by
153
+ - my
154
+ - py
155
+ - v
156
+ - dy
157
+ - '?'
158
+ - ty
159
+ - <sos/eos>
160
+ odim: null
161
+ model_conf: {}
162
+ use_preprocessor: true
163
+ token_type: phn
164
+ bpemodel: null
165
+ non_linguistic_symbols: null
166
+ cleaner: jaconv
167
+ g2p: pyopenjtalk_prosody
168
+ feats_extract: fbank
169
+ feats_extract_conf:
170
+ n_fft: 2048
171
+ hop_length: 300
172
+ win_length: 1200
173
+ fs: 24000
174
+ fmin: 80
175
+ fmax: 7600
176
+ n_mels: 80
177
+ normalize: global_mvn
178
+ normalize_conf:
179
+ stats_file: exp/tts_stats_raw_phn_jaconv_pyopenjtalk_prosody/train/feats_stats.npz
180
+ tts: tacotron2
181
+ tts_conf:
182
+ embed_dim: 512
183
+ elayers: 1
184
+ eunits: 512
185
+ econv_layers: 3
186
+ econv_chans: 512
187
+ econv_filts: 5
188
+ atype: location
189
+ adim: 512
190
+ aconv_chans: 32
191
+ aconv_filts: 15
192
+ cumulate_att_w: true
193
+ dlayers: 2
194
+ dunits: 1024
195
+ prenet_layers: 2
196
+ prenet_units: 256
197
+ postnet_layers: 5
198
+ postnet_chans: 512
199
+ postnet_filts: 5
200
+ output_activation: null
201
+ use_batch_norm: true
202
+ use_concate: true
203
+ use_residual: false
204
+ dropout_rate: 0.5
205
+ zoneout_rate: 0.1
206
+ reduction_factor: 1
207
+ spk_embed_dim: null
208
+ use_masking: true
209
+ bce_pos_weight: 5.0
210
+ use_guided_attn_loss: true
211
+ guided_attn_loss_sigma: 0.4
212
+ guided_attn_loss_lambda: 1.0
213
+ pitch_extract: null
214
+ pitch_extract_conf: {}
215
+ pitch_normalize: null
216
+ pitch_normalize_conf: {}
217
+ energy_extract: null
218
+ energy_extract_conf: {}
219
+ energy_normalize: null
220
+ energy_normalize_conf: {}
221
+ required:
222
+ - output_dir
223
+ - token_list
224
+ version: 0.10.3a1
225
+ distributed: false
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/attn_loss.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/backward_time.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/bce_loss.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/forward_time.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/gpu_max_cached_mem_GB.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/iter_time.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/l1_loss.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/loss.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/mse_loss.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/optim0_lr0.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/optim_step_time.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/images/train_time.png ADDED
exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/train.loss.ave_5best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6e912d7489e27dae8d538b4d10a8eac49191b2763ef6affb0b2da9c8b2559ce
3
+ size 106960730
meta.yaml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ espnet: 0.10.3a2
2
+ files:
3
+ model_file: exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/train.loss.ave_5best.pth
4
+ python: "3.7.3 (default, Mar 27 2019, 22:11:17) \n[GCC 7.3.0]"
5
+ timestamp: 1631246359.054751
6
+ torch: 1.7.1
7
+ yaml_files:
8
+ train_config: exp/tts_train_tacotron2_raw_phn_jaconv_pyopenjtalk_prosody/config.yaml