vits_ljspeech / args.json
lmxue's picture
VITS checkpoint trained on LJSpeech
cb5cf70
{
"base_config": "config/vits.json",
"dataset": [
"LJSpeech",
],
"model": {
"text_token_num": 151,
"filter_channels": 768,
"gin_channels": 0,
"hidden_channels": 192,
"inter_channels": 192,
"kernel_size": 3,
"n_heads": 2,
"n_layers": 6,
"n_layers_q": 3,
"n_speakers": 0,
"p_dropout": 0.1,
"resblock": "1",
"resblock_dilation_sizes": [
[
1,
3,
5,
],
[
1,
3,
5,
],
[
1,
3,
5,
],
],
"resblock_kernel_sizes": [
3,
7,
11
],
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
16,
16,
4,
4
],
"upsample_rates": [
8,
8,
2,
2
],
"use_sdp": true,
"use_spectral_norm": false,
},
"model_type": "VITS",
"preprocess": {
"audio_dir": "audios",
"bits": 8,
"contentvec_dir": "contentvec",
"data_augment": false,
"dur_dir": "durs",
"duration_dir": "duration",
"emo2id": "emo2id.json",
"extract_phone": true,
"phone_extractor": "lexicon",
"lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
"energy_dir": "energys",
"energy_extract_mode": "from_mel",
"energy_norm": false,
"extract_audio": true,
"extract_contentvec_feature": false,
"extract_duration": false,
"extract_energy": false,
"extract_label": false,
"extract_linear_spec": true,
"extract_mcep": false,
"extract_mel": true,
"extract_mert_feature": false,
"extract_pitch": false,
"extract_uv": false,
"extract_wenet_feature": false,
"extract_whisper_feature": false,
"file_lst": "file.lst",
"fmax": null,
"fmin": 0,
"hop_size": 256,
"lab_dir": "labs",
"label_dir": "labels",
"linear_dir": "linears",
"mcep_dir": "mcep",
"mel_dir": "mels",
"mel_min_max_norm": false,
"min_level_db": -115,
"n_fft": 1024,
"n_mel": 80,
"num_silent_frames": 8,
"phone_seq_file": "phone_seq_file",
"pitch_dir": "pitches",
"pitch_extractor": "parselmouth",
"pitch_norm": false,
"processed_dir": "/mnt/workspace/xueliumeng/data/ljspeech/processed_data_vits_accelerate",
"ref_level_db": 20,
"sample_rate": 22050,
"segment_size": 8192,
"spk2id": "spk2id.json",
"text_cleaners": [
"english_cleaners",
],
"train_file": "train.json",
"trim_fft_size": 512,
"trim_hop_size": 128,
"trim_silence": false,
"trim_top_db": 30,
"trimmed_wav_dir": "trimmed_wavs",
"use_audio": true,
"use_dur": false,
"use_emoid": false,
"use_frame_duration": false,
"use_frame_energy": false,
"use_frame_pitch": false,
"use_lab": false,
"use_label": false,
"use_linear": true,
"use_log_scale_energy": false,
"use_log_scale_pitch": false,
"use_mel": true,
"use_min_max_norm_mel": false,
"use_one_hot": false,
"use_phn_seq": false,
"use_phone": true,
"use_phone_duration": false,
"use_phone_energy": false,
"use_phone_pitch": false,
"use_spkid": false,
"use_text": false,
"use_uv": false,
"use_wav": false,
"use_wenet": false,
"utt2emo": "utt2emo",
"utt2spk": "utt2spk",
"uv_dir": "uvs",
"valid_file": "test.json",
"wav_dir": "wavs",
"wenet_dir": "wenet",
"win_size": 1024,
},
"supported_model_type": [
"GANVocoder",
"Fastspeech2",
"DiffSVC",
"Transformer",
"EDM",
"CD",
],
"train": {
"AdamW": {
"betas": [
0.8,
0.99
],
"eps": 1e-09,
},
"batch_size": 16,
"betas": [
0.8,
0.99
],
"c_kl": 1.0,
"c_mel": 45,
"dataloader": {
"num_worker": 32,
"pin_memory": true,
},
"ddp": false,
"epochs": 50000,
"eps": 1e-09,
"fp16_run": true,
"gradient_accumulation_step": 1,
"init_lr_ratio": 1,
"keep_checkpoint_max": 5,
"keep_last": [
3,
-1
],
"learning_rate": 0.0002,
"lr_decay": 0.999875,
"max_epoch": -1,
"max_steps": 1000000,
"multi_speaker_training": false,
"random_seed": 970227,
"run_eval": [
false,
true
],
"sampler": {
"drop_last": true,
"holistic_shuffle": true,
},
"save_checkpoint_stride": [
5,
20
],
"save_checkpoints_steps": 10000,
"save_summary_steps": 500,
"total_training_steps": 50000,
"tracker": [
"tensorboard",
],
"valid_interval": 10000,
"warmup_epochs": 0,
},
"use_custom_dataset": false,
}