Spaces:
Sleeping
Sleeping
{ | |
"base_config": "config/base.json", | |
"model_type": "DiffComoSVC", | |
"task_type": "svc", | |
"use_custom_dataset": false, | |
"preprocess": { | |
// data augmentations | |
"use_pitch_shift": false, | |
"use_formant_shift": false, | |
"use_time_stretch": false, | |
"use_equalizer": false, | |
// acoustic features | |
"extract_mel": true, | |
"mel_min_max_norm": true, | |
"extract_pitch": true, | |
"pitch_extractor": "parselmouth", | |
"extract_uv": true, | |
"extract_energy": true, | |
// content features | |
"extract_whisper_feature": false, | |
"whisper_sample_rate": 16000, | |
"extract_contentvec_feature": false, | |
"contentvec_sample_rate": 16000, | |
"extract_wenet_feature": false, | |
"wenet_sample_rate": 16000, | |
"extract_mert_feature": false, | |
"mert_sample_rate": 16000, | |
// Default config for whisper | |
"whisper_frameshift": 0.01, | |
"whisper_downsample_rate": 2, | |
// Default config for content vector | |
"contentvec_frameshift": 0.02, | |
// Default config for mert | |
"mert_model": "m-a-p/MERT-v1-330M", | |
"mert_feature_layer": -1, | |
"mert_hop_size": 320, | |
// 24k | |
"mert_frameshit": 0.01333, | |
// 10ms | |
"wenet_frameshift": 0.01, | |
// wenetspeech is 4, gigaspeech is 6 | |
"wenet_downsample_rate": 4, | |
// Default config | |
"n_mel": 100, | |
"win_size": 1024, | |
// todo | |
"hop_size": 256, | |
"sample_rate": 24000, | |
"n_fft": 1024, | |
// todo | |
"fmin": 0, | |
"fmax": 12000, | |
// todo | |
"f0_min": 50, | |
// ~C2 | |
"f0_max": 1100, | |
// ~C6 (1100), ~G5 (800) | |
"pitch_bin": 256, | |
"pitch_max": 1100.0, | |
"pitch_min": 50.0, | |
"is_label": true, | |
"is_mu_law": true, | |
"bits": 8, | |
"mel_min_max_stats_dir": "mel_min_max_stats", | |
"whisper_dir": "whisper", | |
"contentvec_dir": "contentvec", | |
"wenet_dir": "wenet", | |
"mert_dir": "mert", | |
// Extract content features using dataloader | |
"pin_memory": true, | |
"num_workers": 8, | |
"content_feature_batch_size": 16, | |
// Features used for model training | |
"use_mel": true, | |
"use_min_max_norm_mel": true, | |
"use_frame_pitch": true, | |
"use_uv": true, | |
"use_frame_energy": true, | |
"use_log_scale_pitch": false, | |
"use_log_scale_energy": false, | |
"use_spkid": true, | |
// Meta file | |
"train_file": "train.json", | |
"valid_file": "test.json", | |
"spk2id": "singers.json", | |
"utt2spk": "utt2singer" | |
}, | |
"model": { | |
"teacher_model_path": "[Your Teacher Model Path].bin", | |
"condition_encoder": { | |
"merge_mode": "add", | |
"input_melody_dim": 1, | |
"use_log_f0": true, | |
"n_bins_melody": 256, | |
// Quantization (0 means no quantization) | |
"output_melody_dim": 384, | |
"input_loudness_dim": 1, | |
"use_log_loudness": true, | |
"n_bins_loudness": 256, | |
"output_loudness_dim": 384, | |
"use_whisper": false, | |
"use_contentvec": false, | |
"use_wenet": false, | |
"use_mert": false, | |
"whisper_dim": 1024, | |
"contentvec_dim": 256, | |
"mert_dim": 256, | |
"wenet_dim": 512, | |
"content_encoder_dim": 384, | |
"output_singer_dim": 384, | |
"singer_table_size": 512, | |
"output_content_dim": 384, | |
"use_spkid": true | |
}, | |
"comosvc": { | |
"distill": false, | |
// conformer encoder | |
"input_dim": 384, | |
"output_dim": 100, | |
"n_heads": 2, | |
"n_layers": 6, | |
"filter_channels": 512, | |
"dropout": 0.1, | |
// karras diffusion | |
"P_mean": -1.2, | |
"P_std": 1.2, | |
"sigma_data": 0.5, | |
"sigma_min": 0.002, | |
"sigma_max": 80, | |
"rho": 7, | |
"n_timesteps": 40, | |
}, | |
"diffusion": { | |
// Diffusion steps encoder | |
"step_encoder": { | |
"dim_raw_embedding": 128, | |
"dim_hidden_layer": 512, | |
"activation": "SiLU", | |
"num_layer": 2, | |
"max_period": 10000 | |
}, | |
// Diffusion decoder | |
"model_type": "bidilconv", | |
// options: bidilconv, unet2d (TODO: unet1d) | |
"bidilconv": { | |
"base_channel": 384, | |
"n_res_block": 20, | |
"conv_kernel_size": 3, | |
"dilation_cycle_length": 4, | |
// special case: 1 means no dilation | |
"conditioner_size": 100 | |
} | |
}, | |
}, | |
"train": { | |
// Basic settings | |
"fast_steps": 0, | |
"batch_size": 32, | |
"gradient_accumulation_step": 1, | |
"max_epoch": -1, | |
// -1 means no limit | |
"save_checkpoint_stride": [ | |
10, | |
100 | |
], | |
// unit is epoch | |
"keep_last": [ | |
3, | |
-1 | |
], | |
// -1 means infinite; if only one number is given, it is broadcast | |
"run_eval": [ | |
false, | |
true | |
], | |
// if only one value is given, it is broadcast | |
// Fix the random seed | |
"random_seed": 10086, | |
// Batchsampler | |
"sampler": { | |
"holistic_shuffle": true, | |
"drop_last": true | |
}, | |
// Dataloader | |
"dataloader": { | |
"num_worker": 32, | |
"pin_memory": true | |
}, | |
// Trackers | |
"tracker": [ | |
"tensorboard" | |
// "wandb", | |
// "cometml", | |
// "mlflow", | |
], | |
// Optimizer | |
"optimizer": "AdamW", | |
"adamw": { | |
"lr": 4.0e-4 | |
// nn model lr | |
}, | |
// LR Scheduler | |
"scheduler": "ReduceLROnPlateau", | |
"reducelronplateau": { | |
"factor": 0.8, | |
"patience": 10, | |
// unit is epoch | |
"min_lr": 1.0e-4 | |
} | |
}, | |
"inference": { | |
"comosvc": { | |
"inference_steps": 40 | |
} | |
} | |
} |