hifigan_ljspeech / args.json
lmxue's picture
Update checkpoint
7e31db6 verified
{
"dataset_path": {
"LJSpeech": "/home/datasets/LJSpeech-1.1",
},
"base_config": "config/base.json",
"dataset": [
"LJSpeech",
],
"preprocess": {
"trim_silence": false,
"num_silent_frames": 8,
"trim_fft_size": 512,
"trim_hop_size": 128,
"trim_top_db": 30,
"extract_mel": true,
"extract_mcep": false,
"extract_pitch": true,
"extract_uv": true,
"pitch_norm": false,
"extract_audio": true,
"extract_label": false,
"pitch_extractor": "parselmouth",
"extract_energy": false,
"energy_norm": false,
"energy_extract_mode": "from_mel",
"extract_duration": false,
"mel_min_max_norm": false,
"mu_law_norm": false,
"extract_whisper_feature": false,
"extract_contentvec_feature": false,
"extract_mert_feature": false,
"extract_wenet_feature": false,
"n_mel": 80,
"win_size": 1024,
"hop_size": 256,
"sample_rate": 22050,
"n_fft": 1024,
"fmin": 0,
"fmax": 8000,
"min_level_db": -115,
"ref_level_db": 20,
"bits": 8,
"processed_dir": "processed_data",
"trimmed_wav_dir": "trimmed_wavs",
"wav_dir": "wavs",
"audio_dir": "audios",
"label_dir": "labels",
"mel_dir": "mels",
"mcep_dir": "mcep",
"dur_dir": "durs",
"lab_dir": "labs",
"wenet_dir": "wenet",
"contentvec_dir": "contentvec",
"pitch_dir": "pitches",
"energy_dir": "energys",
"uv_dir": "uvs",
"duration_dir": "duration",
"phone_seq_file": "phone_seq_file",
"file_lst": "file.lst",
"train_file": "train.json",
"valid_file": "test.json",
"spk2id": "spk2id.json",
"utt2spk": "utt2spk",
"emo2id": "emo2id.json",
"utt2emo": "utt2emo",
"use_phn_seq": false,
"use_lab": false,
"use_mel": true,
"use_wav": false,
"use_phone_pitch": false,
"use_log_scale_pitch": false,
"use_phone_energy": false,
"use_phone_duration": false,
"use_log_scale_energy": false,
"use_wenet": false,
"use_dur": false,
"use_spkid": false,
"use_emoid": false,
"use_frame_pitch": false,
"use_uv": true,
"use_frame_energy": false,
"use_frame_duration": false,
"use_audio": true,
"use_label": false,
"use_one_hot": false,
"data_augment": false,
"align_mel_duration": false,
"f0_min": 50,
"f0_max": 1100,
"pitch_bin": 256,
"pitch_max": 1100.0,
"pitch_min": 50.0,
"cut_mel_frame": 32,
"use_min_max_norm_mel": false,
},
"train": {
"ddp": false,
"random_seed": 970227,
"batch_size": 16,
"epochs": 50000,
"max_steps": 1000000,
"total_training_steps": 50000,
"save_summary_steps": 500,
"save_checkpoints_steps": 10000,
"valid_interval": 10000,
"keep_checkpoint_max": 15,
"multi_speaker_training": false,
"adamw": {
"lr": 0.0002,
"adam_b1": 0.8,
"adam_b2": 0.99,
},
"exponential_lr": {
"lr_decay": 0.999,
},
"criterions": [
"feature",
"discriminator",
"generator",
"mel",
"wav",
],
},
"model_type": "GANVocoder",
"model": {
"generator": "hifigan",
"discriminators": [
"msd",
"mpd",
"msstftd",
"mscqtd",
],
"hifigan": {
"resblock": "2",
"upsample_rates": [
8,
8,
4
],
"upsample_kernel_sizes": [
16,
16,
8
],
"upsample_initial_channel": 256,
"resblock_kernel_sizes": [
3,
5,
7
],
"resblock_dilation_sizes": [
[
1,
2
],
[
2,
6
],
[
3,
12
]
]
},
"mpd": {
"mpd_reshapes": [
2,
3,
5,
7,
11
],
"use_spectral_norm": false,
"discriminator_channel_multi": 1,
},
"msstftd": {
"filters": 32,
},
"mscqtd": {
"hop_lengths": [
512,
256,
256
],
"filters": 32,
"max_filters": 1024,
"filters_scale": 1,
"dilations": [
1,
2,
4
],
"in_channels": 1,
"out_channels": 1,
"n_octaves": [
9,
9,
9
],
"bins_per_octaves": [
24,
36,
48
]
}
},
"exp_name": "hifigan",
}