{
  "dataset_path": {
    "LJSpeech": "/home/datasets/LJSpeech-1.1"
  },
  "base_config": "config/base.json",
  "dataset": [
    "LJSpeech"
  ],
  "preprocess": {
    "trim_silence": false,
    "num_silent_frames": 8,
    "trim_fft_size": 512,
    "trim_hop_size": 128,
    "trim_top_db": 30,
    "extract_mel": true,
    "extract_mcep": false,
    "extract_pitch": true,
    "extract_uv": true,
    "pitch_norm": false,
    "extract_audio": true,
    "extract_label": false,
    "pitch_extractor": "parselmouth",
    "extract_energy": false,
    "energy_norm": false,
    "energy_extract_mode": "from_mel",
    "extract_duration": false,
    "mel_min_max_norm": false,
    "mu_law_norm": false,
    "extract_whisper_feature": false,
    "extract_contentvec_feature": false,
    "extract_mert_feature": false,
    "extract_wenet_feature": false,
    "n_mel": 80,
    "win_size": 1024,
    "hop_size": 256,
    "sample_rate": 22050,
    "n_fft": 1024,
    "fmin": 0,
    "fmax": 8000,
    "min_level_db": -115,
    "ref_level_db": 20,
    "bits": 8,
    "processed_dir": "processed_data",
    "trimmed_wav_dir": "trimmed_wavs",
    "wav_dir": "wavs",
    "audio_dir": "audios",
    "label_dir": "labels",
    "mel_dir": "mels",
    "mcep_dir": "mcep",
    "dur_dir": "durs",
    "lab_dir": "labs",
    "wenet_dir": "wenet",
    "contentvec_dir": "contentvec",
    "pitch_dir": "pitches",
    "energy_dir": "energys",
    "uv_dir": "uvs",
    "duration_dir": "duration",
    "phone_seq_file": "phone_seq_file",
    "file_lst": "file.lst",
    "train_file": "train.json",
    "valid_file": "test.json",
    "spk2id": "spk2id.json",
    "utt2spk": "utt2spk",
    "emo2id": "emo2id.json",
    "utt2emo": "utt2emo",
    "use_phn_seq": false,
    "use_lab": false,
    "use_mel": true,
    "use_wav": false,
    "use_phone_pitch": false,
    "use_log_scale_pitch": false,
    "use_phone_energy": false,
    "use_phone_duration": false,
    "use_log_scale_energy": false,
    "use_wenet": false,
    "use_dur": false,
    "use_spkid": false,
    "use_emoid": false,
    "use_frame_pitch": false,
    "use_uv": true,
    "use_frame_energy": false,
    "use_frame_duration": false,
    "use_audio": true,
    "use_label": false,
    "use_one_hot": false,
    "data_augment": false,
    "align_mel_duration": false,
    "f0_min": 50,
    "f0_max": 1100,
    "pitch_bin": 256,
    "pitch_max": 1100.0,
    "pitch_min": 50.0,
    "cut_mel_frame": 32,
    "use_min_max_norm_mel": false
  },
  "train": {
    "ddp": false,
    "random_seed": 970227,
    "batch_size": 16,
    "epochs": 50000,
    "max_steps": 1000000,
    "total_training_steps": 50000,
    "save_summary_steps": 500,
    "save_checkpoints_steps": 10000,
    "valid_interval": 10000,
    "keep_checkpoint_max": 15,
    "multi_speaker_training": false,
    "adamw": {
      "lr": 0.0002,
      "adam_b1": 0.8,
      "adam_b2": 0.99
    },
    "exponential_lr": {
      "lr_decay": 0.999
    },
    "criterions": [
      "feature",
      "discriminator",
      "generator",
      "mel",
      "wav"
    ]
  },
  "model_type": "GANVocoder",
  "model": {
    "generator": "hifigan",
    "discriminators": [
      "msd",
      "mpd",
      "msstftd",
      "mscqtd"
    ],
    "hifigan": {
      "resblock": "2",
      "upsample_rates": [8, 8, 4],
      "upsample_kernel_sizes": [16, 16, 8],
      "upsample_initial_channel": 256,
      "resblock_kernel_sizes": [3, 5, 7],
      "resblock_dilation_sizes": [
        [1, 2],
        [2, 6],
        [3, 12]
      ]
    },
    "mpd": {
      "mpd_reshapes": [2, 3, 5, 7, 11],
      "use_spectral_norm": false,
      "discriminator_channel_multi": 1
    },
    "msstftd": {
      "filters": 32
    },
    "mscqtd": {
      "hop_lengths": [512, 256, 256],
      "filters": 32,
      "max_filters": 1024,
      "filters_scale": 1,
      "dilations": [1, 2, 4],
      "in_channels": 1,
      "out_channels": 1,
      "n_octaves": [9, 9, 9],
      "bins_per_octaves": [24, 36, 48]
    }
  },
  "exp_name": "hifigan"
}