// NaturalSpeech2 / config / base.json
// yuancwang · init · b725c5a
// raw · history blame · 6.64 kB
{
"supported_model_type": [
"GANVocoder",
"Fastspeech2",
"DiffSVC",
"Transformer",
"EDM",
"CD"
],
"task_type": "",
"dataset": [],
"use_custom_dataset": false,
"preprocess": {
"phone_extractor": "espeak", // "espeak, pypinyin, pypinyin_initials_finals, lexicon"
// trim audio silence
"data_augment": false,
"trim_silence": false,
"num_silent_frames": 8,
"trim_fft_size": 512, // fft size used in trimming
"trim_hop_size": 128, // hop size used in trimming
"trim_top_db": 30, // top db used in trimming sensitive to each dataset
// acoustic features
"extract_mel": false,
"mel_extract_mode": "",
"extract_linear_spec": false,
"extract_mcep": false,
"extract_pitch": false,
"extract_acoustic_token": false,
"pitch_remove_outlier": false,
"extract_uv": false,
"pitch_norm": false,
"extract_audio": false,
"extract_label": false,
"pitch_extractor": "parselmouth", // pyin, dio, pyworld, pyreaper, parselmouth, CWT (Continuous Wavelet Transform)
"extract_energy": false,
"energy_remove_outlier": false,
"energy_norm": false,
"energy_extract_mode": "from_mel",
"extract_duration": false,
"extract_amplitude_phase": false,
"mel_min_max_norm": false,
// lingusitic features
"extract_phone": false,
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
// content features
"extract_whisper_feature": false,
"extract_contentvec_feature": false,
"extract_mert_feature": false,
"extract_wenet_feature": false,
// Settings for data preprocessing
"n_mel": 80,
"win_size": 480,
"hop_size": 120,
"sample_rate": 24000,
"n_fft": 1024,
"fmin": 0,
"fmax": 12000,
"min_level_db": -115,
"ref_level_db": 20,
"bits": 8,
// Directory names of processed data or extracted features
"processed_dir": "processed_data",
"trimmed_wav_dir": "trimmed_wavs", // directory name of silence trimed wav
"raw_data": "raw_data",
"phone_dir": "phones",
"wav_dir": "wavs", // directory name of processed wav (such as downsampled waveform)
"audio_dir": "audios",
"log_amplitude_dir": "log_amplitudes",
"phase_dir": "phases",
"real_dir": "reals",
"imaginary_dir": "imaginarys",
"label_dir": "labels",
"linear_dir": "linears",
"mel_dir": "mels", // directory name of extraced mel features
"mcep_dir": "mcep", // directory name of extraced mcep features
"dur_dir": "durs",
"symbols_dict": "symbols.dict",
"lab_dir": "labs", // directory name of extraced label features
"wenet_dir": "wenet", // directory name of extraced wenet features
"contentvec_dir": "contentvec", // directory name of extraced wenet features
"pitch_dir": "pitches", // directory name of extraced pitch features
"energy_dir": "energys", // directory name of extracted energy features
"phone_pitch_dir": "phone_pitches", // directory name of extraced pitch features
"phone_energy_dir": "phone_energys", // directory name of extracted energy features
"uv_dir": "uvs", // directory name of extracted unvoiced features
"duration_dir": "duration", // ground-truth duration file
"phone_seq_file": "phone_seq_file", // phoneme sequence file
"file_lst": "file.lst",
"train_file": "train.json", // training set, the json file contains detailed information about the dataset, including dataset name, utterance id, duration of the utterance
"valid_file": "valid.json", // validattion set
"spk2id": "spk2id.json", // used for multi-speaker dataset
"utt2spk": "utt2spk", // used for multi-speaker dataset
"emo2id": "emo2id.json", // used for multi-emotion dataset
"utt2emo": "utt2emo", // used for multi-emotion dataset
// Features used for model training
"use_text": false,
"use_phone": false,
"use_phn_seq": false,
"use_lab": false,
"use_linear": false,
"use_mel": false,
"use_min_max_norm_mel": false,
"use_wav": false,
"use_phone_pitch": false,
"use_log_scale_pitch": false,
"use_phone_energy": false,
"use_phone_duration": false,
"use_log_scale_energy": false,
"use_wenet": false,
"use_dur": false,
"use_spkid": false, // True: use speaker id for multi-speaker dataset
"use_emoid": false, // True: use emotion id for multi-emotion dataset
"use_frame_pitch": false,
"use_uv": false,
"use_frame_energy": false,
"use_frame_duration": false,
"use_audio": false,
"use_label": false,
"use_one_hot": false,
"use_amplitude_phase": false,
"data_augment": false,
"align_mel_duration": false
},
"train": {
"ddp": true,
"random_seed": 970227,
"batch_size": 16,
"max_steps": 1000000,
// Trackers
"tracker": [
"tensorboard"
// "wandb",
// "cometml",
// "mlflow",
],
"max_epoch": -1,
// -1 means no limit
"save_checkpoint_stride": [
5,
20
],
// unit is epoch
"keep_last": [
3,
-1
],
// -1 means infinite, if one number will broadcast
"run_eval": [
false,
true
],
// if one number will broadcast
// Fix the random seed
"random_seed": 10086,
// Optimizer
"optimizer": "AdamW",
"adamw": {
"lr": 4.0e-4
// nn model lr
},
// LR Scheduler
"scheduler": "ReduceLROnPlateau",
"reducelronplateau": {
"factor": 0.8,
"patience": 10,
// unit is epoch
"min_lr": 1.0e-4
},
// Batchsampler
"sampler": {
"holistic_shuffle": true,
"drop_last": true
},
// Dataloader
"dataloader": {
"num_worker": 32,
"pin_memory": true
},
"gradient_accumulation_step": 1,
"total_training_steps": 50000,
"save_summary_steps": 500,
"save_checkpoints_steps": 10000,
"valid_interval": 10000,
"keep_checkpoint_max": 5,
"multi_speaker_training": false, // True: train multi-speaker model; False: training single-speaker model;
"max_epoch": -1,
// -1 means no limit
"save_checkpoint_stride": [
5,
20
],
// unit is epoch
"keep_last": [
3,
-1
],
// -1 means infinite, if one number will broadcast
"run_eval": [
false,
true
],
// Batchsampler
"sampler": {
"holistic_shuffle": true,
"drop_last": true
},
// Dataloader
"dataloader": {
"num_worker": 32,
"pin_memory": true
},
// Trackers
"tracker": [
"tensorboard"
// "wandb",
// "cometml",
// "mlflow",
],
},
}