// Experiment configuration for a VitsSVC model.
//
// This file contains `//` comments, so it must be loaded with a
// comment-tolerant parser (JSONC / JSON5 style); a strict RFC 8259 JSON
// parser will reject it. Every comment sits on its own line or at the end of
// a line, because a `//` comment runs to the end of the physical line.
//
// NOTE(review): values here presumably override those in "base_config" --
// confirm against the config loader.
{
  "base_config": "config/vitssvc.json",
  "model_type": "VitsSVC",
  "dataset": [
    "m4singer",
    "opencpop",
    "opensinger",
    "svcc",
    "vctk"
  ],
  "dataset_path": {
    // TODO: Fill in your dataset path
    "m4singer": "[M4Singer dataset path]",
    "opencpop": "[Opencpop dataset path]",
    "opensinger": "[OpenSinger dataset path]",
    "svcc": "[SVCC dataset path]",
    "vctk": "[VCTK dataset path]"
  },
  // TODO: Fill in the output log path. The default value is "Amphion/ckpts/svc"
  "log_dir": "ckpts/svc",
  "preprocess": {
    // TODO: Fill in the output data path. The default value is "Amphion/data"
    "processed_dir": "data",

    // F0 range; the same bounds appear as model.condition_encoder.pitch_min / pitch_max
    "f0_min": 50,
    "f0_max": 1100,
    // f0_bin in sovits
    "pitch_bin": 256,
    // filter_length in sovits
    "n_fft": 2048,
    // hop_length in sovits
    "hop_size": 512,
    // win_length in sovits
    "win_size": 2048,
    "segment_size": 8192,
    "n_mel": 100,
    "sample_rate": 44100,

    // Config for feature extraction
    "extract_mel": true,
    "extract_pitch": true,
    "pitch_extractor": "parselmouth",
    "extract_energy": false,
    "extract_uv": true,
    "extract_linear_spec": true,
    "extract_audio": true,

    // contentvec
    "extract_contentvec_feature": true,
    "contentvec_sample_rate": 16000,
    "contentvec_batch_size": 1,
    "contentvec_frameshift": 0.02,

    // whisper
    "extract_whisper_feature": true,
    "whisper_sample_rate": 16000,
    "whisper_frameshift": 0.01,
    "whisper_downsample_rate": 2,

    // Fill in the content-based pretrained model's path
    "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
    "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
    "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
    "whisper_model": "medium",
    "whisper_model_path": "pretrained/whisper/medium.pt",

    // Config for feature usage
    "use_mel": true,
    "use_frame_pitch": true,
    "use_uv": true,
    "use_spkid": true,
    "use_contentvec": true,
    "use_whisper": true,
    "use_text": false,
    "use_phone": false,

    // Extract content features using dataloader
    "pin_memory": true,
    "num_workers": 8,
    "content_feature_batch_size": 16,

    // Meta file
    "train_file": "train.json",
    // NOTE(review): the validation file is "test.json" -- confirm this is intentional
    "valid_file": "test.json",
    "spk2id": "singers.json",
    "utt2spk": "utt2singer"
  },
  "model": {
    "condition_encoder": {
      // Config for feature usage
      "merge_mode": "add",
      "input_melody_dim": 1,
      "use_log_f0": true,
      // Quantization bins for the melody (0 for no quantization)
      "n_bins_melody": 256,
      "output_melody_dim": 192,
      "use_contentvec": true,
      "use_whisper": true,
      "use_mert": false,
      "use_wenet": false,
      "whisper_dim": 1024,
      "contentvec_dim": 256,
      "content_encoder_dim": 192,
      "output_singer_dim": 192,
      "singer_table_size": 512,
      "output_content_dim": 192,
      "use_spkid": true,
      // Same bounds as preprocess.f0_max / f0_min
      "pitch_max": 1100.0,
      "pitch_min": 50.0
    },
    "vits": {
      "inter_channels": 192,
      "hidden_channels": 192,
      "filter_channels": 256,
      "n_heads": 2,
      "n_layers": 6,
      "kernel_size": 3,
      "p_dropout": 0.1,
      "ssl_dim": 256,
      "n_flow_layer": 4,
      "n_layers_q": 3,
      "gin_channels": 256,
      "n_speakers": 512,
      "use_spectral_norm": false
    },
    "generator": "nsfhifigan"
  },
  "train": {
    "batch_size": 32,
    "learning_rate": 2e-4,
    "gradient_accumulation_step": 1,
    // -1 means no limit
    "max_epoch": -1,
    "save_checkpoint_stride": [
      3,
      50
    ],
    "keep_last": [
      3,
      2
    ],
    "run_eval": [
      true,
      true
    ],
    "adamw": {
      "lr": 2.0e-4
    },
    "reducelronplateau": {
      "factor": 0.8,
      "patience": 30,
      "min_lr": 1.0e-4
    },
    "dataloader": {
      // NOTE(review): spelled "num_worker" here but "num_workers" under
      // "preprocess" -- confirm which key the consumer actually reads
      "num_worker": 8,
      "pin_memory": true
    },
    "sampler": {
      "holistic_shuffle": false,
      "drop_last": true
    }
  },
  "inference": {
    "batch_size": 1
  }
}