# sovits5.0 / configs/base.yaml
# Hosting-page residue, not part of the config (kept for provenance):
# maxmax20160403's picture | mix | 755994c | raw | history blame | 1.57 kB
# Settings grouped under `train`.
train:
  model: "sovits"
  seed: 1234
  epochs: 10000
  # Exponent floats carry an explicit ".0" mantissa: YAML 1.1 loaders such as
  # PyYAML resolve a bare `2e-4` / `1e-9` as a string instead of a float.
  learning_rate: 2.0e-4
  betas: [0.8, 0.99]
  lr_decay: 0.999875
  eps: 1.0e-9
  batch_size: 8
  # NOTE(review): the c_* names suggest per-term loss weights (STFT / mel / KL);
  # confirm against the training code.
  c_stft: 9
  c_mel: 1.0
  c_kl: 0.2
  port: 8001
  # Empty string; presumably means "no pretrained checkpoint" -- TODO confirm.
  pretrain: ""
#############################
# Settings grouped under `data`: file lists plus audio/spectrogram parameters.
data:
  training_files: "files/train.txt"
  validation_files: "files/valid.txt"
  # WARNING: based on hop_length (8000 = 25 * 320). Presumably this must stay
  # a multiple of hop_length if hop_length changes -- TODO confirm.
  segment_size: 8000
  max_wav_value: 32768.0
  sampling_rate: 32000
  filter_length: 1024
  hop_length: 320
  win_length: 1024
  mel_channels: 100
  mel_fmin: 50.0
  # Equals sampling_rate / 2 with the values given here.
  mel_fmax: 16000.0
#############################
# Settings grouped under `vits`: dimension and channel sizes.
# NOTE(review): what each *_dim / *_channels value feeds is defined by the
# model code, which is not visible from this file.
vits:
  ppg_dim: 1024
  vec_dim: 256
  spk_dim: 256
  gin_channels: 256
  inter_channels: 192
  hidden_channels: 192
  filter_channels: 640
#############################
# Settings grouped under `gen`.
gen:
  # Same value as vits.inter_channels (192) -- presumably must match; verify.
  upsample_input: 192
  # The rates multiply to 5 * 4 * 4 * 2 * 2 = 320, the same value as
  # data.hop_length. Presumably the two must be kept in sync -- TODO confirm.
  upsample_rates: [5, 4, 4, 2, 2]
  # One kernel size per entry in upsample_rates (both lists have 5 items).
  upsample_kernel_sizes: [15, 8, 8, 4, 4]
  upsample_initial_channel: 320
  resblock_kernel_sizes: [3, 7, 11]
  # One dilation list per entry in resblock_kernel_sizes (3 of each).
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
#############################
# Settings grouped under `mpd`. The keys are nested so that they are scoped to
# this section; `mrd` defines keys with the same names.
mpd:
  periods: [2, 3, 5, 7, 11]
  kernel_size: 5
  stride: 3
  use_spectral_norm: false
  lReLU_slope: 0.2
#############################
# Settings grouped under `mrd`. The keys are nested so that they are scoped to
# this section; `mpd` defines keys with the same names.
mrd:
  # Each tuple is (filter_length, hop_length, win_length).
  # The value is deliberately a quoted string holding a Python-style list of
  # tuples; presumably the consumer parses it itself -- confirm before
  # converting it to a native YAML sequence.
  resolutions: "[(1024, 120, 600), (2048, 240, 1200), (4096, 480, 2400), (512, 50, 240)]"
  use_spectral_norm: false
  lReLU_slope: 0.2
#############################
# Settings grouped under `log`: reporting intervals and output directories.
log:
  # NOTE(review): the unit of each *_interval (steps vs. epochs) is not stated
  # in this file -- confirm against the training loop.
  info_interval: 100
  eval_interval: 1
  save_interval: 5
  num_audio: 6
  pth_dir: "chkpt"
  log_dir: "logs"
#############################
# Settings grouped under `dist_config`.
dist_config:
  dist_backend: "nccl"
  # Quoted because the value contains ":" characters.
  dist_url: "tcp://localhost:54321"
  world_size: 1