{
  "ASR_config": {
    "batch_size": 64,
    "dataset_params": {
      "data_augmentation": false
    },
    "device": "cuda",
    "epochs": 180,
    "log_dir": "logs/20201006",
    "model_params": {
      "hidden_dim": 256,
      "input_dim": 80,
      "n_token": 178,
      "token_embedding_dim": 512
    },
    "optimizer_params": {
      "lr": 0.0005
    },
    "preprocess_parasm": {
      "mel_params": {
        "n_mels": 80
      },
      "spect_params": {
        "hop_length": 300,
        "n_fft": 2048,
        "win_length": 1200
      },
      "sr": 24000
    },
    "pretrained_model": "",
    "save_freq": 5,
    "train_data": "ASRDataset/train_list.txt",
    "val_data": "ASRDataset/val_list.txt"
  },
  "BERT_CONFIG": {
    "batch_size": 32,
    "data_folder": "wikipedia_20220301.en.processed",
    "dataset_params": {
      "max_mel_length": 512,
      "phoneme_mask_prob": 0.1,
      "replace_prob": 0.2,
      "token_maps": "token_maps.pkl",
      "token_mask": "M",
      "token_separator": " ",
      "tokenizer": "bert-base-multilingual-cased",
      "word_mask_prob": 0.15,
      "word_separator": 102
    },
    "log_dir": "Checkpoint_all_phonemes",
    "log_interval": 10,
    "mixed_precision": "fp16",
    "model_params": {
      "dropout": 0.1,
      "hidden_size": 768,
      "intermediate_size": 2048,
      "max_position_embeddings": 512,
      "num_attention_heads": 12,
      "num_hidden_layers": 12,
      "vocab_size": 178
    },
    "num_process": 1,
    "num_steps": 2000000,
    "save_interval": 20000
  },
  "LIBRI_TTS_CONFIG": {
    "ASR_config": "Utils/ASR/config.yml",
    "ASR_path": "Utils/ASR/epoch_00080.pth",
    "F0_path": "Utils/JDC/bst.t7",
    "PLBERT_dir": "Utils/PLBERT/",
    "batch_size": 8,
    "data_params": {
      "OOD_data": "Data/OOD_texts.txt",
      "min_length": 50,
      "root_path": "",
      "train_data": "Data/train_list.txt",
      "val_data": "Data/val_list.txt"
    },
    "device": "cuda",
    "epochs_1st": 40,
    "epochs_2nd": 25,
    "first_stage_path": "first_stage.pth",
    "load_only_params": false,
    "log_dir": "Models/LibriTTS",
    "log_interval": 10,
    "loss_params": {
      "TMA_epoch": 4,
      "diff_epoch": 0,
      "joint_epoch": 0,
      "lambda_F0": 1.0,
      "lambda_ce": 20.0,
      "lambda_diff": 1.0,
      "lambda_dur": 1.0,
      "lambda_gen": 1.0,
      "lambda_mel": 5.0,
      "lambda_mono": 1.0,
      "lambda_norm": 1.0,
      "lambda_s2s": 1.0,
      "lambda_slm": 1.0,
      "lambda_sty": 1.0
    },
    "max_len": 300,
    "model_params": {
      "decoder": {
        "resblock_dilation_sizes": [
          [
            1,
            3,
            5
          ],
          [
            1,
            3,
            5
          ],
          [
            1,
            3,
            5
          ]
        ],
        "resblock_kernel_sizes": [
          3,
          7,
          11
        ],
        "type": "hifigan",
        "upsample_initial_channel": 512,
        "upsample_kernel_sizes": [
          20,
          10,
          6,
          4
        ],
        "upsample_rates": [
          10,
          5,
          3,
          2
        ]
      },
      "diffusion": {
        "dist": {
          "estimate_sigma_data": true,
          "mean": -3.0,
          "sigma_data": 0.19926648961191362,
          "std": 1.0
        },
        "embedding_mask_proba": 0.1,
        "transformer": {
          "head_features": 64,
          "multiplier": 2,
          "num_heads": 8,
          "num_layers": 3
        }
      },
      "dim_in": 64,
      "dropout": 0.2,
      "hidden_dim": 512,
      "max_conv_dim": 512,
      "max_dur": 50,
      "multispeaker": true,
      "n_layer": 3,
      "n_mels": 80,
      "n_token": 178,
      "slm": {
        "hidden": 768,
        "initial_channel": 64,
        "model": "microsoft/wavlm-base-plus",
        "nlayers": 13,
        "sr": 16000
      },
      "style_dim": 128
    },
    "optimizer_params": {
      "bert_lr": 1e-05,
      "ft_lr": 1e-05,
      "lr": 0.0001
    },
    "preprocess_params": {
      "spect_params": {
        "hop_length": 300,
        "n_fft": 2048,
        "win_length": 1200
      },
      "sr": 24000
    },
    "pretrained_model": "Models/LibriTTS/epoch_2nd_00002.pth",
    "save_freq": 1,
    "second_stage_load_pretrained": true,
    "slmadv_params": {
      "batch_percentage": 0.5,
      "iter": 20,
      "max_len": 500,
      "min_len": 400,
      "scale": 0.01,
      "sig": 1.5,
      "thresh": 5
    }
  },
  "config_path": null,
  "model_checkpoint_path": null,
  "phoneme_converter": "gruut"
}