{
  "base_config": "config/diffusion.json",
  "dataset": [
    "vocalist_l1"
  ],
  "exp_name": "vocalist_l1_contentvec+whisper",
  "inference": {
    "diffusion": {
      "scheduler": "pndm",
      "scheduler_settings": {
        "num_inference_timesteps": 1000
      }
    }
  },
  "model": {
    "condition_encoder": {
      "content_encoder_dim": 384,
      "contentvec_dim": 256,
      "f0_max": 1100,
      "f0_min": 50,
      "input_loudness_dim": 1,
      "input_melody_dim": 1,
      "merge_mode": "add",
      "mert_dim": 256,
      "n_bins_loudness": 256,
      "n_bins_melody": 256,
      "output_content_dim": 384,
      "output_loudness_dim": 384,
      "output_melody_dim": 384,
      "output_singer_dim": 384,
      "pitch_max": 1100,
      "pitch_min": 50,
      "singer_table_size": 512,
      "use_conformer_for_content_features": false,
      "use_contentvec": true,
      "use_log_f0": true,
      "use_log_loudness": true,
      "use_mert": false,
      "use_singer_encoder": true,
      "use_spkid": true,
      "use_wenet": false,
      "use_whisper": true,
      "wenet_dim": 512,
      "whisper_dim": 1024
    },
    "diffusion": {
      "bidilconv": {
        "base_channel": 384,
        "conditioner_size": 384,
        "conv_kernel_size": 3,
        "dilation_cycle_length": 4,
        "n_res_block": 20
      },
      "model_type": "bidilconv",
      "scheduler": "ddpm",
      "scheduler_settings": {
        "beta_end": 0.02,
        "beta_schedule": "linear",
        "beta_start": 0.0001,
        "num_train_timesteps": 1000
      },
      "step_encoder": {
        "activation": "SiLU",
        "dim_hidden_layer": 512,
        "dim_raw_embedding": 128,
        "max_period": 10000,
        "num_layer": 2
      },
      "unet2d": {
        "down_block_types": [
          "CrossAttnDownBlock2D",
          "CrossAttnDownBlock2D",
          "CrossAttnDownBlock2D",
          "DownBlock2D"
        ],
        "in_channels": 1,
        "mid_block_type": "UNetMidBlock2DCrossAttn",
        "only_cross_attention": false,
        "out_channels": 1,
        "up_block_types": [
          "UpBlock2D",
          "CrossAttnUpBlock2D",
          "CrossAttnUpBlock2D",
          "CrossAttnUpBlock2D"
        ]
      }
    }
  },
  "model_type": "DiffWaveNetSVC",
  "preprocess": {
    "audio_dir": "audios",
    "bits": 8,
    "content_feature_batch_size": 16,
    "contentvec_batch_size": 1,
    "contentvec_dir": "contentvec",
    "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt",
    "contentvec_frameshift": 0.02,
    "contentvec_sample_rate": 16000,
    "dur_dir": "durs",
    "duration_dir": "duration",
    "emo2id": "emo2id.json",
    "energy_dir": "energys",
    "extract_audio": false,
    "extract_contentvec_feature": true,
    "extract_energy": true,
    "extract_label": false,
    "extract_mcep": false,
    "extract_mel": true,
    "extract_mert_feature": false,
    "extract_pitch": true,
    "extract_uv": true,
    "extract_wenet_feature": false,
    "extract_whisper_feature": true,
    "f0_max": 1100,
    "f0_min": 50,
    "file_lst": "file.lst",
    "fmax": 12000,
    "fmin": 0,
    "hop_size": 256,
    "is_label": true,
    "is_mu_law": true,
    "lab_dir": "labs",
    "label_dir": "labels",
    "mcep_dir": "mcep",
    "mel_dir": "mels",
    "mel_min_max_norm": true,
    "mel_min_max_stats_dir": "mel_min_max_stats",
    "mert_dir": "mert",
    "mert_feature_layer": -1,
    "mert_frameshit": 0.01333,
    "mert_hop_size": 320,
    "mert_model": "m-a-p/MERT-v1-330M",
    "min_level_db": -115,
    "mu_law_norm": false,
    "n_fft": 1024,
    "n_mel": 100,
    "num_silent_frames": 8,
    "num_workers": 8,
    "phone_seq_file": "phone_seq_file",
    "pin_memory": true,
    "pitch_bin": 256,
    "pitch_dir": "pitches",
    "pitch_extractor": "parselmouth",
    "pitch_max": 1100.0,
    "pitch_min": 50.0,
    "processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data",
    "ref_level_db": 20,
    "sample_rate": 24000,
    "spk2id": "singers.json",
    "train_file": "train.json",
    "trim_fft_size": 512,
    "trim_hop_size": 128,
    "trim_silence": false,
    "trim_top_db": 30,
    "trimmed_wav_dir": "trimmed_wavs",
    "use_audio": false,
    "use_contentvec": true,
    "use_dur": false,
    "use_emoid": false,
    "use_frame_duration": false,
    "use_frame_energy": true,
    "use_frame_pitch": true,
    "use_lab": false,
    "use_label": false,
    "use_log_scale_energy": false,
    "use_log_scale_pitch": false,
    "use_mel": true,
    "use_mert": false,
    "use_min_max_norm_mel": true,
    "use_one_hot": false,
    "use_phn_seq": false,
    "use_phone_duration": false,
    "use_phone_energy": false,
    "use_phone_pitch": false,
    "use_spkid": true,
    "use_uv": true,
    "use_wav": false,
    "use_wenet": false,
    "use_whisper": true,
    "utt2emo": "utt2emo",
    "utt2spk": "utt2singer",
    "uv_dir": "uvs",
    "valid_file": "test.json",
    "wav_dir": "wavs",
    "wenet_batch_size": 1,
    "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml",
    "wenet_dir": "wenet",
    "wenet_downsample_rate": 4,
    "wenet_frameshift": 0.01,
    "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt",
    "wenet_sample_rate": 16000,
    "whisper_batch_size": 30,
    "whisper_dir": "whisper",
    "whisper_downsample_rate": 2,
    "whisper_frameshift": 0.01,
    "whisper_model": "medium",
    "whisper_model_path": "pretrained/whisper/medium.pt",
    "win_size": 1024
  },
  "supported_model_type": [
    "Fastspeech2",
    "DiffSVC",
    "Transformer",
    "EDM",
    "CD"
  ],
  "train": {
    "adamw": {
      "lr": 0.0004
    },
    "batch_size": 32,
    "dataloader": {
      "num_worker": 8,
      "pin_memory": true
    },
    "ddp": true,
    "epochs": 50000,
    "gradient_accumulation_step": 1,
    "keep_checkpoint_max": 5,
    "keep_last": [
      5,
      -1
    ],
    "max_epoch": -1,
    "max_steps": 1000000,
    "multi_speaker_training": false,
    "optimizer": "AdamW",
    "random_seed": 10086,
    "reducelronplateau": {
      "factor": 0.8,
      "min_lr": 0.0001,
      "patience": 10
    },
    "run_eval": [
      false,
      true
    ],
    "sampler": {
      "drop_last": true,
      "holistic_shuffle": false
    },
    "save_checkpoint_stride": [
      3,
      10
    ],
    "save_checkpoints_steps": 10000,
    "save_summary_steps": 500,
    "scheduler": "ReduceLROnPlateau",
    "total_training_steps": 50000,
    "tracker": [
      "tensorboard"
    ],
    "valid_interval": 10000
  },
  "use_custom_dataset": true
}