{
    "base_config": "config/comosvc.json",
    "model_type": "DiffComoSVC",
    "dataset": [
        "m4singer",
        "opencpop",
        "opensinger",
        "svcc",
        "vctk"
    ],
    "dataset_path": {
        "m4singer": "[M4Singer dataset path]",
        "opencpop": "[Opencpop dataset path]",
        "opensinger": "[OpenSinger dataset path]",
        "svcc": "[SVCC dataset path]",
        "vctk": "[VCTK dataset path]"
    },
    "log_dir": "[Your path to save logs and checkpoints]",
    "preprocess": {
        "processed_dir": "[Your path to save processed data]",
        "extract_mel": true,
        "extract_pitch": true,
        "extract_energy": true,
        "extract_whisper_feature": true,
        "extract_contentvec_feature": true,
        "extract_wenet_feature": false,
        "whisper_batch_size": 30,
        "contentvec_batch_size": 1,
        "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
        "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
        "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
        "whisper_model": "medium",
        "whisper_model_path": "pretrained/whisper/medium.pt",
        "use_mel": true,
        "use_min_max_norm_mel": true,
        "use_frame_pitch": true,
        "use_frame_energy": true,
        "use_spkid": true,
        "use_whisper": true,
        "use_contentvec": true,
        "use_wenet": false,
        "n_mel": 100,
        "sample_rate": 24000
    },
    "model": {
        "teacher_model_path": "[Your_teacher_model_checkpoint].bin",
        "condition_encoder": {
            "use_whisper": true,
            "use_contentvec": true,
            "use_wenet": false,
            "whisper_dim": 1024,
            "contentvec_dim": 256,
            "wenet_dim": 512,
            "use_singer_encoder": false,
            "pitch_min": 50,
            "pitch_max": 1100
        },
        "comosvc": {
            "distill": false,
            "input_dim": 384,
            "output_dim": 100,
            "n_heads": 2,
            "n_layers": 6,
            "filter_channels": 512,
            "dropout": 0.1,
            "P_mean": -1.2,
            "P_std": 1.2,
            "sigma_data": 0.5,
            "sigma_min": 0.002,
            "sigma_max": 80,
            "rho": 7,
            "n_timesteps": 40
        },
        "diffusion": {
            "step_encoder": {
                "dim_raw_embedding": 128,
                "dim_hidden_layer": 512,
                "activation": "SiLU",
                "num_layer": 2,
                "max_period": 10000
            },
            "model_type": "bidilconv",
            "bidilconv": {
                "base_channel": 384,
                "n_res_block": 20,
                "conv_kernel_size": 3,
                "dilation_cycle_length": 4,
                "conditioner_size": 100
            }
        }
    },
    "train": {
        "batch_size": 64,
        "gradient_accumulation_step": 1,
        "max_epoch": -1,
        "save_checkpoint_stride": [
            50,
            50
        ],
        "keep_last": [
            5,
            -1
        ],
        "run_eval": [
            false,
            true
        ],
        "adamw": {
            "lr": 4.0e-4
        },
        "reducelronplateau": {
            "factor": 0.8,
            "patience": 10,
            "min_lr": 1.0e-4
        },
        "dataloader": {
            "num_worker": 8,
            "pin_memory": true
        },
        "sampler": {
            "holistic_shuffle": false,
            "drop_last": true
        }
    },
    "inference": {
        "comosvc": {
            "inference_steps": 40
        }
    }
}