Create new file
Browse files
@@ -0,0 +1,249 @@
1 |
2 |
3 |
- text-to-speech
4 |
- gronings
5 |
- Tacotron 2
6 |
language: gos
7 |
8 |
- gronings
9 |
10 |
## GroTTS Model
11 |
12 |
This model was trained with the Tacotron 2 architecture on approximately 2 hours of the Gronings TTS dataset. For the best results, download the vocoder separately and then use the following code:
13 |
14 |
from espnet2.bin.tts_inference import Text2Speech
15 |
from scipy.io.wavfile import write
16 |
17 |
model = Text2Speech.from_pretrained(
18 |
19 |
20 |
21 |
output = model("This is a simple test.")
22 |
write("x.wav", 22050, output['wav'].numpy())
23 |
24 |
25 |
26 |
## TTS config
27 |
28 |
29 |
30 |
31 |
config: conf/train.yaml
32 |
print_config: false
33 |
log_level: INFO
34 |
dry_run: false
35 |
iterator_type: sequence
36 |
output_dir: exp/tts_train_raw_char_tacotron
37 |
ngpu: 1
38 |
seed: 0
39 |
num_workers: 1
40 |
num_att_plot: 3
41 |
dist_backend: nccl
42 |
dist_init_method: env://
43 |
dist_world_size: null
44 |
dist_rank: null
45 |
local_rank: 0
46 |
dist_master_addr: null
47 |
dist_master_port: null
48 |
dist_launcher: null
49 |
multiprocessing_distributed: false
50 |
unused_parameters: false
51 |
sharded_ddp: false
52 |
cudnn_enabled: true
53 |
cudnn_benchmark: false
54 |
cudnn_deterministic: true
55 |
collect_stats: false
56 |
write_collected_feats: false
57 |
max_epoch: 200
58 |
patience: null
59 |
60 |
- valid
61 |
- loss
62 |
63 |
- valid
64 |
- loss
65 |
- min
66 |
67 |
- - valid
68 |
- loss
69 |
- min
70 |
- - train
71 |
- loss
72 |
- min
73 |
keep_nbest_models: 5
74 |
nbest_averaging_interval: 0
75 |
grad_clip: 1.0
76 |
grad_clip_type: 2.0
77 |
grad_noise: false
78 |
accum_grad: 2
79 |
no_forward_run: false
80 |
resume: true
81 |
train_dtype: float32
82 |
use_amp: false
83 |
log_interval: null
84 |
use_matplotlib: true
85 |
use_tensorboard: true
86 |
use_wandb: false
87 |
wandb_project: null
88 |
wandb_id: null
89 |
wandb_entity: null
90 |
wandb_name: null
91 |
wandb_model_log_interval: -1
92 |
detect_anomaly: false
93 |
pretrain_path: null
94 |
init_param: []
95 |
ignore_init_mismatch: false
96 |
freeze_param: []
97 |
num_iters_per_epoch: 1000
98 |
batch_size: 20
99 |
valid_batch_size: null
100 |
batch_bins: 2000000
101 |
valid_batch_bins: null
102 |
103 |
- exp/tts_stats_raw_char_tacotron/train/text_shape.char
104 |
- exp/tts_stats_raw_char_tacotron/train/speech_shape
105 |
106 |
- exp/tts_stats_raw_char_tacotron/valid/text_shape.char
107 |
- exp/tts_stats_raw_char_tacotron/valid/speech_shape
108 |
batch_type: numel
109 |
valid_batch_type: null
110 |
111 |
- 150
112 |
- 204800
113 |
sort_in_batch: descending
114 |
sort_batch: descending
115 |
multiple_iterator: false
116 |
chunk_length: 500
117 |
chunk_shift_ratio: 0.5
118 |
num_cache_chunks: 1024
119 |
120 |
- - dump/raw/tr_no_dev/text
121 |
- text
122 |
- text
123 |
- - dump/raw/tr_no_dev/wav.scp
124 |
- speech
125 |
- sound
126 |
127 |
- - dump/raw/dev/text
128 |
- text
129 |
- text
130 |
- - dump/raw/dev/wav.scp
131 |
- speech
132 |
- sound
133 |
allow_variable_data_keys: false
134 |
max_cache_size: 0.0
135 |
max_cache_fd: 32
136 |
valid_max_cache_size: null
137 |
optim: adam
138 |
139 |
lr: 0.001
140 |
eps: 1.0e-06
141 |
weight_decay: 0.0
142 |
scheduler: null
143 |
scheduler_conf: {}
144 |
145 |
- <blank>
146 |
- <unk>
147 |
- <space>
148 |
- E
149 |
- N
150 |
- A
151 |
- O
152 |
- T
153 |
- I
154 |
- R
155 |
- D
156 |
- L
157 |
- S
158 |
- K
159 |
- M
160 |
- G
161 |
- U
162 |
- H
163 |
- .
164 |
- W
165 |
- V
166 |
- Z
167 |
- P
168 |
- B
169 |
- ','
170 |
- J
171 |
- C
172 |
- F
173 |
- '?'
174 |
- ''''
175 |
- '!'
176 |
- Y
177 |
- X
178 |
- '`'
179 |
- <sos/eos>
180 |
odim: null
181 |
model_conf: {}
182 |
use_preprocessor: true
183 |
token_type: char
184 |
bpemodel: null
185 |
non_linguistic_symbols: null
186 |
cleaner: tacotron
187 |
g2p: g2p_en
188 |
feats_extract: fbank
189 |
190 |
n_fft: 1024
191 |
hop_length: 256
192 |
win_length: null
193 |
fs: 22050
194 |
fmin: 80
195 |
fmax: 7600
196 |
n_mels: 80
197 |
normalize: global_mvn
198 |
199 |
stats_file: exp/tts_stats_raw_char_tacotron/train/feats_stats.npz
200 |
tts: tacotron2
201 |
202 |
embed_dim: 512
203 |
elayers: 1
204 |
eunits: 512
205 |
econv_layers: 3
206 |
econv_chans: 512
207 |
econv_filts: 5
208 |
atype: location
209 |
adim: 512
210 |
aconv_chans: 32
211 |
aconv_filts: 15
212 |
cumulate_att_w: true
213 |
dlayers: 2
214 |
dunits: 1024
215 |
prenet_layers: 2
216 |
prenet_units: 256
217 |
postnet_layers: 5
218 |
postnet_chans: 512
219 |
postnet_filts: 5
220 |
output_activation: null
221 |
use_batch_norm: true
222 |
use_concate: true
223 |
use_residual: false
224 |
dropout_rate: 0.5
225 |
zoneout_rate: 0.1
226 |
reduction_factor: 1
227 |
spk_embed_dim: null
228 |
use_masking: true
229 |
bce_pos_weight: 5.0
230 |
use_guided_attn_loss: true
231 |
guided_attn_loss_sigma: 0.4
232 |
guided_attn_loss_lambda: 1.0
233 |
pitch_extract: null
234 |
pitch_extract_conf: {}
235 |
pitch_normalize: null
236 |
pitch_normalize_conf: {}
237 |
energy_extract: null
238 |
energy_extract_conf: {}
239 |
energy_normalize: null
240 |
energy_normalize_conf: {}
241 |
242 |
- output_dir
243 |
- token_list
244 |
version: 0.10.7a1
245 |
distributed: false
246 |
247 |
248 |
249 |