{
  "base_config": "config/vits.json",
  "dataset": [
    "LJSpeech"
  ],
  "model": {
    "text_token_num": 151,
    "filter_channels": 768,
    "gin_channels": 0,
    "hidden_channels": 192,
    "inter_channels": 192,
    "kernel_size": 3,
    "n_heads": 2,
    "n_layers": 6,
    "n_layers_q": 3,
    "n_speakers": 0,
    "p_dropout": 0.1,
    "resblock": "1",
    "resblock_dilation_sizes": [
      [1, 3, 5],
      [1, 3, 5],
      [1, 3, 5]
    ],
    "resblock_kernel_sizes": [3, 7, 11],
    "upsample_initial_channel": 512,
    "upsample_kernel_sizes": [16, 16, 4, 4],
    "upsample_rates": [8, 8, 2, 2],
    "use_sdp": true,
    "use_spectral_norm": false
  },
  "model_type": "VITS",
  "preprocess": {
    "audio_dir": "audios",
    "bits": 8,
    "contentvec_dir": "contentvec",
    "data_augment": false,
    "dur_dir": "durs",
    "duration_dir": "duration",
    "emo2id": "emo2id.json",
    "extract_phone": true,
    "phone_extractor": "lexicon",
    "lexicon_path": "./text/lexicon/librispeech-lexicon.txt",
    "energy_dir": "energys",
    "energy_extract_mode": "from_mel",
    "energy_norm": false,
    "extract_audio": true,
    "extract_contentvec_feature": false,
    "extract_duration": false,
    "extract_energy": false,
    "extract_label": false,
    "extract_linear_spec": true,
    "extract_mcep": false,
    "extract_mel": true,
    "extract_mert_feature": false,
    "extract_pitch": false,
    "extract_uv": false,
    "extract_wenet_feature": false,
    "extract_whisper_feature": false,
    "file_lst": "file.lst",
    "fmax": null,
    "fmin": 0,
    "hop_size": 256,
    "lab_dir": "labs",
    "label_dir": "labels",
    "linear_dir": "linears",
    "mcep_dir": "mcep",
    "mel_dir": "mels",
    "mel_min_max_norm": false,
    "min_level_db": -115,
    "n_fft": 1024,
    "n_mel": 80,
    "num_silent_frames": 8,
    "phone_seq_file": "phone_seq_file",
    "pitch_dir": "pitches",
    "pitch_extractor": "parselmouth",
    "pitch_norm": false,
    "processed_dir": "/mnt/workspace/xueliumeng/data/ljspeech/processed_data_vits_accelerate",
    "ref_level_db": 20,
    "sample_rate": 22050,
    "segment_size": 8192,
    "spk2id": "spk2id.json",
    "text_cleaners": [
      "english_cleaners"
    ],
    "train_file": "train.json",
    "trim_fft_size": 512,
    "trim_hop_size": 128,
    "trim_silence": false,
    "trim_top_db": 30,
    "trimmed_wav_dir": "trimmed_wavs",
    "use_audio": true,
    "use_dur": false,
    "use_emoid": false,
    "use_frame_duration": false,
    "use_frame_energy": false,
    "use_frame_pitch": false,
    "use_lab": false,
    "use_label": false,
    "use_linear": true,
    "use_log_scale_energy": false,
    "use_log_scale_pitch": false,
    "use_mel": true,
    "use_min_max_norm_mel": false,
    "use_one_hot": false,
    "use_phn_seq": false,
    "use_phone": true,
    "use_phone_duration": false,
    "use_phone_energy": false,
    "use_phone_pitch": false,
    "use_spkid": false,
    "use_text": false,
    "use_uv": false,
    "use_wav": false,
    "use_wenet": false,
    "utt2emo": "utt2emo",
    "utt2spk": "utt2spk",
    "uv_dir": "uvs",
    "valid_file": "test.json",
    "wav_dir": "wavs",
    "wenet_dir": "wenet",
    "win_size": 1024
  },
  "supported_model_type": [
    "GANVocoder",
    "Fastspeech2",
    "DiffSVC",
    "Transformer",
    "EDM",
    "CD"
  ],
  "train": {
    "AdamW": {
      "betas": [0.8, 0.99],
      "eps": 1e-09
    },
    "batch_size": 16,
    "betas": [0.8, 0.99],
    "c_kl": 1.0,
    "c_mel": 45,
    "dataloader": {
      "num_worker": 32,
      "pin_memory": true
    },
    "ddp": false,
    "epochs": 50000,
    "eps": 1e-09,
    "fp16_run": true,
    "gradient_accumulation_step": 1,
    "init_lr_ratio": 1,
    "keep_checkpoint_max": 5,
    "keep_last": [3, -1],
    "learning_rate": 0.0002,
    "lr_decay": 0.999875,
    "max_epoch": -1,
    "max_steps": 1000000,
    "multi_speaker_training": false,
    "random_seed": 970227,
    "run_eval": [false, true],
    "sampler": {
      "drop_last": true,
      "holistic_shuffle": true
    },
    "save_checkpoint_stride": [5, 20],
    "save_checkpoints_steps": 10000,
    "save_summary_steps": 500,
    "total_training_steps": 50000,
    "tracker": [
      "tensorboard"
    ],
    "valid_interval": 10000,
    "warmup_epochs": 0
  },
  "use_custom_dataset": false
}