File size: 5,697 Bytes
6ee88e6 |
{
"base_config": "config/valle.json",
"dataset": [
"LibriTTS",
],
"dataset_path": {
"LibriTTS": "/mnt/data1/xueliumeng/processed_data_valle_libritts/",
},
"exp_name": "amphion_libritts_valle_nar_gpus_6_adamw_coswarmup_16k_ar_ckpt_85epoch",
"log_dir": "/mnt/data1/wangyuancheng/valle_debug",
"model": {
"add_prenet": false,
"decoder_dim": 1024,
"nar_scale_factor": 1,
"nhead": 16,
"norm_first": true,
"num_decoder_layers": 12,
"num_quantizers": 8,
"prefix_mode": 1,
"prepend_bos": false,
"scaling_xformers": false,
"share_embedding": true,
},
"model_type": "VALLE",
"preprocess": {
"acoustic_token_dir": "acoutic_tokens",
"acoustic_token_extractor": "Encodec",
"align_mel_duration": false,
"audio_dir": "audios",
"bits": 8,
"contentvec_dir": "contentvec",
"data_augment": false,
"dur_dir": "durs",
"duration_dir": "duration",
"emo2id": "emo2id.json",
"energy_dir": "energys",
"energy_extract_mode": "from_mel",
"energy_norm": false,
"energy_remove_outlier": false,
"extract_acoustic_token": true,
"extract_audio": false,
"extract_contentvec_feature": false,
"extract_duration": false,
"extract_energy": false,
"extract_label": false,
"extract_linear_spec": false,
"extract_mcep": false,
"extract_mel": false,
"extract_mert_feature": false,
"extract_phoneme": true,
"extract_pitch": false,
"extract_uv": false,
"extract_wenet_feature": false,
"extract_whisper_feature": false,
"file_lst": "file.lst",
"fmax": 12000,
"fmin": 0,
"hop_size": 120,
"lab_dir": "labs",
"label_dir": "labels",
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
"linear_dir": "linears",
"max_duration": 14,
"mcep_dir": "mcep",
"mel_dir": "mels",
"mel_extract_mode": "",
"mel_min_max_norm": false,
"min_duration": 0.5,
"min_level_db": -115,
"n_fft": 1024,
"n_mel": 80,
"num_silent_frames": 8,
"phone_dir": "phones",
"phone_energy_dir": "phone_energys",
"phone_pitch_dir": "phone_pitches",
"phone_seq_file": "phone_seq_file",
"pitch_dir": "pitches",
"pitch_extractor": "parselmouth",
"pitch_norm": false,
"pitch_remove_outlier": false,
"processed_dir": "/mnt/data1/xueliumeng/processed_data_valle_libritts",
"raw_data": "raw_data",
"ref_level_db": 20,
"sample_rate": 24000,
"sampling_rate": "24000",
"spk2id": "spk2id.json",
"symbols_dict": "symbols.dict",
"text_extractor": "espeak",
"train_file": "train.json",
"trim_fft_size": 512,
"trim_hop_size": 128,
"trim_silence": false,
"trim_top_db": 30,
"trimmed_wav_dir": "trimmed_wavs",
"use_acoustic_token": true,
"use_audio": false,
"use_dur": false,
"use_emoid": false,
"use_frame_duration": false,
"use_frame_energy": false,
"use_frame_pitch": false,
"use_lab": false,
"use_label": false,
"use_linear": false,
"use_log_scale_energy": false,
"use_log_scale_pitch": false,
"use_mel": false,
"use_min_max_norm_mel": false,
"use_one_hot": false,
"use_phn_seq": false,
"use_phone": true,
"use_phone_duration": false,
"use_phone_energy": false,
"use_phone_pitch": false,
"use_spkid": false,
"use_text": false,
"use_uv": false,
"use_wav": false,
"use_wenet": false,
"utt2emo": "utt2emo",
"utt2spk": "utt2spk",
"uv_dir": "uvs",
"valid_file": "valid.json",
"wav_dir": "wavs",
"wenet_dir": "wenet",
"win_size": 480,
},
"supported_model_type": [
"GANVocoder",
"Fastspeech2",
"DiffSVC",
"Transformer",
"EDM",
"CD",
],
"train": {
"adamw": {
"lr": 0.0004,
},
"base_lr": 0.000125,
"batch_size": 5,
"dataloader": {
"num_worker": 32,
"pin_memory": true,
},
"ddp": false,
"gradient_accumulation_step": 1,
"keep_checkpoint_max": 5,
"keep_last": [
3,
-1,
],
"max_epoch": 100,
"max_sentences": 8,
"max_steps": 1000000,
"max_tokens": 3600,
"multi_speaker_training": false,
"optimizer": "AdamW",
"optimizer_name": "AdamW",
"random_seed": 10086,
"reducelronplateau": {
"factor": 0.8,
"min_lr": 0.0001,
"patience": 10,
},
"reset_interval": 200,
"run_eval": [
false,
true,
],
"sampler": {
"drop_last": true,
"holistic_shuffle": true,
},
"save_checkpoint_stride": [
1,
1,
],
"save_checkpoints_steps": 10000,
"save_summary_steps": 500,
"scheduler": "ReduceLROnPlateau",
"scheduler_name": "Cosine",
"start_epoch": 1,
"total_training_steps": 50000,
"tracker": [
"tensorboard",
],
"train_stage": 0,
"valid_interval": 1000,
"warmup_steps": 16000,
},
"use_custom_dataset": false,
}