Spaces:

amphion
/

NaturalSpeech2

Running on T4

yuancwang

init

b725c5a 11 months ago

6.8 kB

	{
	// FIXME: THESE ARE LEGACY
	"base_config": "config/base.json",
	"model_type": "diffusion",
	"task_type": "svc",
	"use_custom_dataset": false,
	"preprocess": {
	// data augmentations
	"use_pitch_shift": false,
	"use_formant_shift": false,
	"use_time_stretch": false,
	"use_equalizer": false,
	// acoustic features
	"extract_mel": true,
	"mel_min_max_norm": true,
	"extract_pitch": true,
	"pitch_extractor": "parselmouth",
	"extract_uv": true,
	"extract_energy": true,
	// content features
	"extract_whisper_feature": false,
	"whisper_sample_rate": 16000,
	"extract_contentvec_feature": false,
	"contentvec_sample_rate": 16000,
	"extract_wenet_feature": false,
	"wenet_sample_rate": 16000,
	"extract_mert_feature": false,
	"mert_sample_rate": 16000,
	// Default config for whisper
	"whisper_frameshift": 0.01,
	"whisper_downsample_rate": 2,
	// Default config for content vector
	"contentvec_frameshift": 0.02,
	// Default config for mert
	"mert_model": "m-a-p/MERT-v1-330M",
	"mert_feature_layer": -1,
	"mert_hop_size": 320,
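	// 24 kHz audio (presumably MERT's input rate): hop 320 / 24000 ≈ 0.01333 s per frame
	// NOTE(review): the key below is spelled "mert_frameshit"; do not rename it unless the code reading this config is updated too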
	"mert_frameshit": 0.01333,
	// 10ms
	"wenet_frameshift": 0.01,
	// wenetspeech is 4, gigaspeech is 6
	"wenet_downsample_rate": 4,
	// Default config
	"n_mel": 100,
	"win_size": 1024,
	// todo
	"hop_size": 256,
	"sample_rate": 24000,
	"n_fft": 1024,
	// todo
	"fmin": 0,
	"fmax": 12000,
	// todo
	"f0_min": 50,
	// "f0_min" above is roughly the pitch C2
	"f0_max": 1100,
	// "f0_max" above: 1100 is roughly C6; 800 would be roughly G5
	"pitch_bin": 256,
	"pitch_max": 1100.0,
	"pitch_min": 50.0,
	"is_label": true,
	"is_mu_law": true,
	"bits": 8,
	"mel_min_max_stats_dir": "mel_min_max_stats",
	"whisper_dir": "whisper",
	"contentvec_dir": "contentvec",
	"wenet_dir": "wenet",
	"mert_dir": "mert",
	// Extract content features using dataloader
	"pin_memory": true,
	"num_workers": 8,
	"content_feature_batch_size": 16,
	// Features used for model training
	"use_mel": true,
	"use_min_max_norm_mel": true,
	"use_frame_pitch": true,
	"use_uv": true,
	"use_frame_energy": true,
	"use_log_scale_pitch": false,
	"use_log_scale_energy": false,
	"use_spkid": true,
	// Meta file
	"train_file": "train.json",
	"valid_file": "test.json",
	"spk2id": "singers.json",
	"utt2spk": "utt2singer"
	},
	"model": {
	"condition_encoder": {
	"merge_mode": "add",
	"input_melody_dim": 1,
	"use_log_f0": true,
	"n_bins_melody": 256,
	// "n_bins_melody" above: quantization setting (0 means no quantization)
	"output_melody_dim": 384,
	"input_loudness_dim": 1,
	"use_log_loudness": true,
	"n_bins_loudness": 256,
	"output_loudness_dim": 384,
	"use_whisper": false,
	"use_contentvec": false,
	"use_wenet": false,
	"use_mert": false,
	"whisper_dim": 1024,
	"contentvec_dim": 256,
	"mert_dim": 256,
	"wenet_dim": 512,
	"content_encoder_dim": 384,
	"output_singer_dim": 384,
	"singer_table_size": 512,
	"output_content_dim": 384,
	"use_spkid": true
	},
	// FIXME: FOLLOWING ARE NEW!!
	"diffusion": {
	"scheduler": "ddpm",
	"scheduler_settings": {
	"num_train_timesteps": 1000,
	"beta_start": 1.0e-4,
	"beta_end": 0.02,
	"beta_schedule": "linear"
	},
	// Diffusion steps encoder
	"step_encoder": {
	"dim_raw_embedding": 128,
	"dim_hidden_layer": 512,
	"activation": "SiLU",
	"num_layer": 2,
	"max_period": 10000
	},
	// Diffusion decoder
	"model_type": "bidilconv",
	// "model_type" options: bidilconv, unet2d (TODO: unet1d)
	"bidilconv": {
	"base_channel": 384,
	"n_res_block": 20,
	"conv_kernel_size": 3,
	"dilation_cycle_length": 4,
	// "dilation_cycle_length" above: the special value 1 means no dilation
	"conditioner_size": 384
	},
	"unet2d": {
	"in_channels": 1,
	"out_channels": 1,
	"down_block_types": [
	"CrossAttnDownBlock2D",
	"CrossAttnDownBlock2D",
	"CrossAttnDownBlock2D",
	"DownBlock2D"
	],
	"mid_block_type": "UNetMidBlock2DCrossAttn",
	"up_block_types": [
	"UpBlock2D",
	"CrossAttnUpBlock2D",
	"CrossAttnUpBlock2D",
	"CrossAttnUpBlock2D"
	],
	"only_cross_attention": false
	}
	}
	},
	// FIXME: FOLLOWING ARE NEW!!
	"train": {
	// Basic settings
	"batch_size": 64,
	"gradient_accumulation_step": 1,
	"max_epoch": -1,
	// "max_epoch" above: -1 means no limit
	"save_checkpoint_stride": [
	5,
	20
	],
	// "save_checkpoint_stride" above is measured in epochs
	"keep_last": [
	3,
	-1
	],
	// "keep_last" above: -1 means unlimited; if a single number is given, it is broadcast
	"run_eval": [
	false,
	true
	],
	// "run_eval" above: if a single value is given, it is broadcast
	// Fix the random seed
	"random_seed": 10086,
	// Batchsampler
	"sampler": {
	"holistic_shuffle": true,
	"drop_last": true
	},
	// Dataloader
	"dataloader": {
	"num_worker": 32,
	"pin_memory": true
	},
	// Trackers
	"tracker": [
	"tensorboard"
	// "wandb",
	// "cometml",
	// "mlflow",
	],
	// Optimizer
	"optimizer": "AdamW",
	"adamw": {
	"lr": 4.0e-4
	// "lr" above: learning rate of the neural network model
	},
	// LR Scheduler
	"scheduler": "ReduceLROnPlateau",
	"reducelronplateau": {
	"factor": 0.8,
	"patience": 10,
	// "patience" above is measured in epochs
	"min_lr": 1.0e-4
	}
	},
	"inference": {
	"diffusion": {
	"scheduler": "pndm",
	"scheduler_settings": {
	"num_inference_timesteps": 1000
	}
	}
	}
	}