RMSnow's picture
vocalist contentvec and whisper as content features
6e14f3c
{
"base_config": "config/diffusion.json",
"dataset": [
"vocalist_l1",
],
"exp_name": "vocalist_l1_contentvec+whisper",
"inference": {
"diffusion": {
"scheduler": "pndm",
"scheduler_settings": {
"num_inference_timesteps": 1000,
},
},
},
"model": {
"condition_encoder": {
"content_encoder_dim": 384,
"contentvec_dim": 256,
"f0_max": 1100,
"f0_min": 50,
"input_loudness_dim": 1,
"input_melody_dim": 1,
"merge_mode": "add",
"mert_dim": 256,
"n_bins_loudness": 256,
"n_bins_melody": 256,
"output_content_dim": 384,
"output_loudness_dim": 384,
"output_melody_dim": 384,
"output_singer_dim": 384,
"pitch_max": 1100,
"pitch_min": 50,
"singer_table_size": 512,
"use_conformer_for_content_features": false,
"use_contentvec": true,
"use_log_f0": true,
"use_log_loudness": true,
"use_mert": false,
"use_singer_encoder": true,
"use_spkid": true,
"use_wenet": false,
"use_whisper": true,
"wenet_dim": 512,
"whisper_dim": 1024,
},
"diffusion": {
"bidilconv": {
"base_channel": 384,
"conditioner_size": 384,
"conv_kernel_size": 3,
"dilation_cycle_length": 4,
"n_res_block": 20,
},
"model_type": "bidilconv",
"scheduler": "ddpm",
"scheduler_settings": {
"beta_end": 0.02,
"beta_schedule": "linear",
"beta_start": 0.0001,
"num_train_timesteps": 1000,
},
"step_encoder": {
"activation": "SiLU",
"dim_hidden_layer": 512,
"dim_raw_embedding": 128,
"max_period": 10000,
"num_layer": 2,
},
"unet2d": {
"down_block_types": [
"CrossAttnDownBlock2D",
"CrossAttnDownBlock2D",
"CrossAttnDownBlock2D",
"DownBlock2D",
],
"in_channels": 1,
"mid_block_type": "UNetMidBlock2DCrossAttn",
"only_cross_attention": false,
"out_channels": 1,
"up_block_types": [
"UpBlock2D",
"CrossAttnUpBlock2D",
"CrossAttnUpBlock2D",
"CrossAttnUpBlock2D",
],
},
},
},
"model_type": "DiffWaveNetSVC",
"preprocess": {
"audio_dir": "audios",
"bits": 8,
"content_feature_batch_size": 16,
"contentvec_batch_size": 1,
"contentvec_dir": "contentvec",
"contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
"contentvec_frameshift": 0.02,
"contentvec_sample_rate": 16000,
"dur_dir": "durs",
"duration_dir": "duration",
"emo2id": "emo2id.json",
"energy_dir": "energys",
"extract_audio": false,
"extract_contentvec_feature": true,
"extract_energy": true,
"extract_label": false,
"extract_mcep": false,
"extract_mel": true,
"extract_mert_feature": false,
"extract_pitch": true,
"extract_uv": true,
"extract_wenet_feature": false,
"extract_whisper_feature": true,
"f0_max": 1100,
"f0_min": 50,
"file_lst": "file.lst",
"fmax": 12000,
"fmin": 0,
"hop_size": 256,
"is_label": true,
"is_mu_law": true,
"lab_dir": "labs",
"label_dir": "labels",
"mcep_dir": "mcep",
"mel_dir": "mels",
"mel_min_max_norm": true,
"mel_min_max_stats_dir": "mel_min_max_stats",
"mert_dir": "mert",
"mert_feature_layer": -1,
"mert_frameshit": 0.01333,
"mert_hop_size": 320,
"mert_model": "m-a-p/MERT-v1-330M",
"min_level_db": -115,
"mu_law_norm": false,
"n_fft": 1024,
"n_mel": 100,
"num_silent_frames": 8,
"num_workers": 8,
"phone_seq_file": "phone_seq_file",
"pin_memory": true,
"pitch_bin": 256,
"pitch_dir": "pitches",
"pitch_extractor": "parselmouth",
"pitch_max": 1100.0,
"pitch_min": 50.0,
"processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
"ref_level_db": 20,
"sample_rate": 24000,
"spk2id": "singers.json",
"train_file": "train.json",
"trim_fft_size": 512,
"trim_hop_size": 128,
"trim_silence": false,
"trim_top_db": 30,
"trimmed_wav_dir": "trimmed_wavs",
"use_audio": false,
"use_contentvec": true,
"use_dur": false,
"use_emoid": false,
"use_frame_duration": false,
"use_frame_energy": true,
"use_frame_pitch": true,
"use_lab": false,
"use_label": false,
"use_log_scale_energy": false,
"use_log_scale_pitch": false,
"use_mel": true,
"use_mert": false,
"use_min_max_norm_mel": true,
"use_one_hot": false,
"use_phn_seq": false,
"use_phone_duration": false,
"use_phone_energy": false,
"use_phone_pitch": false,
"use_spkid": true,
"use_uv": true,
"use_wav": false,
"use_wenet": false,
"use_whisper": true,
"utt2emo": "utt2emo",
"utt2spk": "utt2singer",
"uv_dir": "uvs",
"valid_file": "test.json",
"wav_dir": "wavs",
"wenet_batch_size": 1,
"wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
"wenet_dir": "wenet",
"wenet_downsample_rate": 4,
"wenet_frameshift": 0.01,
"wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
"wenet_sample_rate": 16000,
"whisper_batch_size": 30,
"whisper_dir": "whisper",
"whisper_downsample_rate": 2,
"whisper_frameshift": 0.01,
"whisper_model": "medium",
"whisper_model_path": "pretrained/whisper/medium.pt",
"win_size": 1024,
},
"supported_model_type": [
"Fastspeech2",
"DiffSVC",
"Transformer",
"EDM",
"CD",
],
"train": {
"adamw": {
"lr": 0.0004,
},
"batch_size": 32,
"dataloader": {
"num_worker": 8,
"pin_memory": true,
},
"ddp": true,
"epochs": 50000,
"gradient_accumulation_step": 1,
"keep_checkpoint_max": 5,
"keep_last": [
5,
-1,
],
"max_epoch": -1,
"max_steps": 1000000,
"multi_speaker_training": false,
"optimizer": "AdamW",
"random_seed": 10086,
"reducelronplateau": {
"factor": 0.8,
"min_lr": 0.0001,
"patience": 10,
},
"run_eval": [
false,
true,
],
"sampler": {
"drop_last": true,
"holistic_shuffle": false,
},
"save_checkpoint_stride": [
3,
10,
],
"save_checkpoints_steps": 10000,
"save_summary_steps": 500,
"scheduler": "ReduceLROnPlateau",
"total_training_steps": 50000,
"tracker": [
"tensorboard",
],
"valid_interval": 10000,
},
"use_custom_dataset": true,
}