alefiury
/

free-svc

voice conversion

singing voice conversion

Model card Files Files and versions Community

free-svc / config.yaml

alefiury's picture

Upload 2 files

7782720 verified about 1 month ago

history blame contribute delete

3.27 kB

	defaults:
	- common

	train:
	batch_size: 128
	betas: [0.8, 0.99]
	c_kl: 1.0
	c_mel: 45
	distributed: false # BUG: multi-gpu is not working
	use_multiprocessing: false # BUG: multi-gpu is not working
	epochs: 20
	eps: 1e-9
	fp16_run: false
	init_lr_ratio: 1
	raise_error: false
	learning_rate: 2e-4
	log_interval: 10
	log_level: ${log_level}
	lr_decay: 0.98
	max_speclen: 128
	port: 8005
	resume_training: false # set to false to finetune from a model
	seed: 1234
	segment_size: 8960
	use_sr: false
	valid_epoch_interval: 1
	valid_steps_interval: 1000
	save_epoch_interval: 10
	save_steps_interval: 1000
	warmup_epochs: 0
	# weighted_batch_speaker_sampling : false
	# weighted_batch_lang_sampling : false
	weighted_batch_speaker_sampling : 0.5
	weighted_batch_lang_sampling : 0.5

	data:
	dataset_dir: /raid/lucasgris/free-svc/data
	filter_length: 1280
	hop_length: 320
	max_wav_value: 32768.0
	mel_fmax: null
	mel_fmin: 0.0
	n_mel_channels: 80
	num_workers: 64
	# For pitch extraction, set the pitch_predictor (will compute in dataloader) or pitch_features_dir (will load from disk)
	pitch_predictor: rmvpe # pm \| crepe \| harvest \| dio \| rmvpe \| fcpe
	pitch_features_dir: ${data.dataset_dir}/pitch_features/
	sampling_rate: 24000
	spectrogram_dir: null #${data.dataset_dir}/spectrograms # it is recommended NOT to use if you have small disk space
	# For speaker embedding extraction, set the use_spk_emb to True and spk_embeddings_dir (will load from disk) or configure the model to compute it on forward
	use_spk_emb: true
	spk_embeddings_dir: ${data.dataset_dir}/spk_embeddings
	# SR augmentation is deprecated, set use_sr to False
	sr_min_max: [68, 92]
	# For content feature extraction, set the content_feature_dir (will load from disk) or configure the model to compute it on forward
	content_feature_dir: null
	training_files: data/train.csv
	validation_files: data/valid.csv
	win_length: 1280

	model:
	save_dir: null
	filter_channels: 768
	finetune_from_model:
	discriminator: /raid/lucasgris/free-svc/D-freevc-24.pth
	generator: /raid/lucasgris/free-svc/freevc-24.pth
	hidden_channels: 192
	inter_channels: 192
	kernel_size: 3
	n_heads: 2
	n_layers_q: 3
	n_layers: 6
	p_dropout: 0.1
	resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
	resblock_kernel_sizes: [3,7,11]
	resblock: 1
	c_dim: 768
	upsample_initial_channel: 512
	upsample_kernel_sizes: [16,16,4,4]
	upsample_rates: [10,8,2,2]
	use_spectral_norm: false
	freeze_external_spk: true
	device: cuda
	# For online speaker embedding extraction, set the use_spk_emb to True and spk_encoder_type
	use_spk_emb: false
	gin_channels: null # gin_channels = spk_encoder.embedding_dim
	spk_encoder_type: null # ECAPA2SpeakerEncoder16k \|
	# For content feature extraction, set the content_encoder_type and content_encoder_ckpt
	content_encoder_type: null # load from disk (data) - hubert \| wavlm
	content_encoder_ckpt: null # load from disk (data) - [path] \| models/wavlm/WavLM-Large.pt \| lengyue233/content-vec-best
	post_content_encoder_type: vits-encoder-with-uv-emb # or freevc-bottleneck
	coarse_f0: true
	cond_f0_on_flow: false