{ "base_config": "config/diffusion.json", "dataset": [ "vocalist_l1", ], "exp_name": "vocalist_l1_contentvec+whisper", "inference": { "diffusion": { "scheduler": "pndm", "scheduler_settings": { "num_inference_timesteps": 1000, }, }, }, "model": { "condition_encoder": { "content_encoder_dim": 384, "contentvec_dim": 256, "f0_max": 1100, "f0_min": 50, "input_loudness_dim": 1, "input_melody_dim": 1, "merge_mode": "add", "mert_dim": 256, "n_bins_loudness": 256, "n_bins_melody": 256, "output_content_dim": 384, "output_loudness_dim": 384, "output_melody_dim": 384, "output_singer_dim": 384, "pitch_max": 1100, "pitch_min": 50, "singer_table_size": 512, "use_conformer_for_content_features": false, "use_contentvec": true, "use_log_f0": true, "use_log_loudness": true, "use_mert": false, "use_singer_encoder": true, "use_spkid": true, "use_wenet": false, "use_whisper": true, "wenet_dim": 512, "whisper_dim": 1024, }, "diffusion": { "bidilconv": { "base_channel": 384, "conditioner_size": 384, "conv_kernel_size": 3, "dilation_cycle_length": 4, "n_res_block": 20, }, "model_type": "bidilconv", "scheduler": "ddpm", "scheduler_settings": { "beta_end": 0.02, "beta_schedule": "linear", "beta_start": 0.0001, "num_train_timesteps": 1000, }, "step_encoder": { "activation": "SiLU", "dim_hidden_layer": 512, "dim_raw_embedding": 128, "max_period": 10000, "num_layer": 2, }, "unet2d": { "down_block_types": [ "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D", ], "in_channels": 1, "mid_block_type": "UNetMidBlock2DCrossAttn", "only_cross_attention": false, "out_channels": 1, "up_block_types": [ "UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", ], }, }, }, "model_type": "DiffWaveNetSVC", "preprocess": { "audio_dir": "audios", "bits": 8, "content_feature_batch_size": 16, "contentvec_batch_size": 1, "contentvec_dir": "contentvec", "contentvec_file": "pretrained/contentvec/checkpoint_best_legacy_500.pt", "contentvec_frameshift": 0.02, "contentvec_sample_rate": 16000, "dur_dir": "durs", "duration_dir": "duration", "emo2id": "emo2id.json", "energy_dir": "energys", "extract_audio": false, "extract_contentvec_feature": true, "extract_energy": true, "extract_label": false, "extract_mcep": false, "extract_mel": true, "extract_mert_feature": false, "extract_pitch": true, "extract_uv": true, "extract_wenet_feature": false, "extract_whisper_feature": true, "f0_max": 1100, "f0_min": 50, "file_lst": "file.lst", "fmax": 12000, "fmin": 0, "hop_size": 256, "is_label": true, "is_mu_law": true, "lab_dir": "labs", "label_dir": "labels", "mcep_dir": "mcep", "mel_dir": "mels", "mel_min_max_norm": true, "mel_min_max_stats_dir": "mel_min_max_stats", "mert_dir": "mert", "mert_feature_layer": -1, "mert_frameshit": 0.01333, "mert_hop_size": 320, "mert_model": "m-a-p/MERT-v1-330M", "min_level_db": -115, "mu_law_norm": false, "n_fft": 1024, "n_mel": 100, "num_silent_frames": 8, "num_workers": 8, "phone_seq_file": "phone_seq_file", "pin_memory": true, "pitch_bin": 256, "pitch_dir": "pitches", "pitch_extractor": "parselmouth", "pitch_max": 1100.0, "pitch_min": 50.0, "processed_dir": "ckpts/svc/vocalist_l1_contentvec+whisper/data", "ref_level_db": 20, "sample_rate": 24000, "spk2id": "singers.json", "train_file": "train.json", "trim_fft_size": 512, "trim_hop_size": 128, "trim_silence": false, "trim_top_db": 30, "trimmed_wav_dir": "trimmed_wavs", "use_audio": false, "use_contentvec": true, "use_dur": false, "use_emoid": false, "use_frame_duration": false, "use_frame_energy": true, "use_frame_pitch": true, "use_lab": false, "use_label": false, "use_log_scale_energy": false, "use_log_scale_pitch": false, "use_mel": true, "use_mert": false, "use_min_max_norm_mel": true, "use_one_hot": false, "use_phn_seq": false, "use_phone_duration": false, "use_phone_energy": false, "use_phone_pitch": false, "use_spkid": true, "use_uv": true, "use_wav": false, "use_wenet": false, "use_whisper": true, "utt2emo": "utt2emo", "utt2spk": "utt2singer", "uv_dir": "uvs", "valid_file": "test.json", "wav_dir": "wavs", "wenet_batch_size": 1, "wenet_config": "pretrained/wenet/20220506_u2pp_conformer_exp/train.yaml", "wenet_dir": "wenet", "wenet_downsample_rate": 4, "wenet_frameshift": 0.01, "wenet_model_path": "pretrained/wenet/20220506_u2pp_conformer_exp/final.pt", "wenet_sample_rate": 16000, "whisper_batch_size": 30, "whisper_dir": "whisper", "whisper_downsample_rate": 2, "whisper_frameshift": 0.01, "whisper_model": "medium", "whisper_model_path": "pretrained/whisper/medium.pt", "win_size": 1024, }, "supported_model_type": [ "Fastspeech2", "DiffSVC", "Transformer", "EDM", "CD", ], "train": { "adamw": { "lr": 0.0004, }, "batch_size": 32, "dataloader": { "num_worker": 8, "pin_memory": true, }, "ddp": true, "epochs": 50000, "gradient_accumulation_step": 1, "keep_checkpoint_max": 5, "keep_last": [ 5, -1, ], "max_epoch": -1, "max_steps": 1000000, "multi_speaker_training": false, "optimizer": "AdamW", "random_seed": 10086, "reducelronplateau": { "factor": 0.8, "min_lr": 0.0001, "patience": 10, }, "run_eval": [ false, true, ], "sampler": { "drop_last": true, "holistic_shuffle": false, }, "save_checkpoint_stride": [ 3, 10, ], "save_checkpoints_steps": 10000, "save_summary_steps": 500, "scheduler": "ReduceLROnPlateau", "total_training_steps": 50000, "tracker": [ "tensorboard", ], "valid_interval": 10000, }, "use_custom_dataset": true, }