# free-svc / config.yaml
# alefiury's picture
# Upload 2 files
# 7782720 verified
# Defaults list: names the `common` config to be combined with this file
# (presumably a Hydra defaults list -- confirm against the config loader).
defaults:
  - common
# Training-run settings. Every key below is nested under `train`.
train:
  batch_size: 128
  betas: [0.8, 0.99]
  c_kl: 1.0
  c_mel: 45
  distributed: false  # BUG: multi-gpu is not working
  use_multiprocessing: false  # BUG: multi-gpu is not working
  epochs: 20
  # Written with an explicit mantissa dot so that YAML 1.1 loaders
  # (e.g. PyYAML) read the value as a float instead of the string "1e-9".
  eps: 1.0e-9
  fp16_run: false
  init_lr_ratio: 1
  raise_error: false
  # Explicit mantissa dot so YAML 1.1 loaders read a float, not a string.
  learning_rate: 2.0e-4
  log_interval: 10
  # Interpolated from the top-level `log_level` key, which is not defined in
  # this file (presumably supplied by the `common` defaults -- confirm).
  log_level: ${log_level}
  lr_decay: 0.98
  max_speclen: 128
  port: 8005
  resume_training: false  # set to false to finetune from a model
  seed: 1234
  segment_size: 8960
  use_sr: false
  # Validation and checkpoint cadence, expressed both in epochs and in steps.
  valid_epoch_interval: 1
  valid_steps_interval: 1000
  save_epoch_interval: 10
  save_steps_interval: 1000
  warmup_epochs: 0
  # Weighted batch sampling by speaker / language. The commented-out lines
  # show the alternative `false` setting; the active values are 0.5 each.
  # weighted_batch_speaker_sampling: false
  # weighted_batch_lang_sampling: false
  weighted_batch_speaker_sampling: 0.5
  weighted_batch_lang_sampling: 0.5
# Dataset locations and audio feature-extraction settings. Every key below is
# nested under `data`; the `${data.dataset_dir}` interpolations rely on that.
data:
  dataset_dir: /raid/lucasgris/free-svc/data
  filter_length: 1280
  hop_length: 320
  max_wav_value: 32768.0
  mel_fmax: null
  mel_fmin: 0.0
  n_mel_channels: 80
  num_workers: 64
  # For pitch extraction, set pitch_predictor (pitch is computed in the
  # dataloader) or pitch_features_dir (pitch is loaded from disk).
  pitch_predictor: rmvpe  # pm | crepe | harvest | dio | rmvpe | fcpe
  pitch_features_dir: ${data.dataset_dir}/pitch_features/
  sampling_rate: 24000
  # Alternative value: ${data.dataset_dir}/spectrograms
  # It is recommended NOT to set this if you have little disk space.
  spectrogram_dir: null
  # For speaker embedding extraction, set use_spk_emb to true and
  # spk_embeddings_dir (embeddings are loaded from disk), or configure the
  # model to compute them on forward.
  use_spk_emb: true
  spk_embeddings_dir: ${data.dataset_dir}/spk_embeddings
  # SR augmentation is deprecated; keep train.use_sr set to false.
  sr_min_max: [68, 92]
  # For content feature extraction, set content_feature_dir (features are
  # loaded from disk) or configure the model to compute them on forward.
  content_feature_dir: null
  training_files: data/train.csv
  validation_files: data/valid.csv
  win_length: 1280
# Model architecture and checkpoint settings. Every key below is nested under
# `model`, which keeps `model.use_spk_emb` distinct from `data.use_spk_emb`.
model:
  save_dir: null
  filter_channels: 768
  # Checkpoints to start from when finetuning (see train.resume_training).
  finetune_from_model:
    discriminator: /raid/lucasgris/free-svc/D-freevc-24.pth
    generator: /raid/lucasgris/free-svc/freevc-24.pth
  hidden_channels: 192
  inter_channels: 192
  kernel_size: 3
  n_heads: 2
  n_layers_q: 3
  n_layers: 6
  p_dropout: 0.1
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  resblock_kernel_sizes: [3, 7, 11]
  resblock: 1
  c_dim: 768
  upsample_initial_channel: 512
  upsample_kernel_sizes: [16, 16, 4, 4]
  upsample_rates: [10, 8, 2, 2]
  use_spectral_norm: false
  freeze_external_spk: true
  # NOTE(review): placed under `model` based on key order in the flattened
  # source -- confirm it is not meant to be a top-level key.
  device: cuda
  # For online speaker embedding extraction, set use_spk_emb to true and
  # choose a spk_encoder_type.
  use_spk_emb: false
  gin_channels: null  # gin_channels = spk_encoder.embedding_dim
  spk_encoder_type: null  # ECAPA2SpeakerEncoder16k |
  # For content feature extraction, set content_encoder_type and
  # content_encoder_ckpt.
  content_encoder_type: null  # load from disk (data) - hubert | wavlm
  # load from disk (data) - [path] | models/wavlm/WavLM-Large.pt |
  # lengyue233/content-vec-best
  content_encoder_ckpt: null
  post_content_encoder_type: vits-encoder-with-uv-emb  # or freevc-bottleneck
  coarse_f0: true
  cond_f0_on_flow: false