---
# Hyperparameter configuration with four sections: `audio`, `lang`, `model`
# and `stats`.
#
# NOTE(review): the nesting in this file was reconstructed from the
# alphabetical ordering of the keys (each mapping is sorted, as in a sorted
# dump). Confirm it against the code that loads this file, in particular the
# two placements flagged with NOTE(review) below.

audio:
  # Written with a mantissa dot so YAML 1.1 loaders (e.g. PyYAML) read a
  # float; a bare `1e-10` is loaded as a string by those parsers.
  eps: 1.0e-10
  fft_size: 2048
  filter_length: 1200
  hop_size: 300
  log_base: 10.0
  mel_fmax: 7600
  mel_fmin: 80
  num_mels: 80
  sampling_rate: 24000
  win_length: 1200
  window: hann

lang:
  - de
  - en

model:
  decoder:
    conv_filter_size: 1024
    conv_kernel_size:
      - 9
      - 1
    dropout: 0.2
    n_head: 2
    n_layers: 6
    scln: true
  emb_dim: 512
  emb_reduction: 1
  encoder:
    depth: 2
    expansion: 2
    fs2_dropout: 0.2
    fs2_head: 2
    fs2_layer: 4
    kernel_size: 5
    kind: fastspeech2
    n_heads: 2
    ve_energy_quantization: linear
    ve_n_bins: 256
    ve_pitch_quantization: linear
    vp_dropout: 0.5
    vp_filter_size: 256
    vp_kernel_size: 3
  gst:
    n_heads: 8
    n_style_tokens: 2000
    ref_enc_filters:
      - 32
      - 32
      - 64
      - 64
      - 128
      - 128
  max_seq_len: 1500
  postnet:
    postnet_embedding_dim: 0
    postnet_kernel_size: 5
    postnet_n_convolutions: 5
  # NOTE(review): placed at model level next to `emb_dim`; key order alone
  # would also allow it under `postnet` — confirm.
  punct_emb_dim: 16
  resnet:
    encoder_type: ASP
    layers:
      - 3
      - 4
      - 6
      - 3
    num_filters:
      - 32
      - 64
      - 128
      - 256
  spkemb:
    kind: ResNetSE34V2

# NOTE(review): placed at top level; key order alone would also allow it
# under `model` — confirm against the loader.
stats:
  energy_max: 305.466064453125
  energy_min: -2.440225667951865
  pitch_max: 656.2979356469282
  pitch_min: -45.333167047555264