styletts2-HF / config.json
not-lain's picture
Push model using huggingface_hub.
45908a9 verified
{
"ASR_config": {
"batch_size": 64,
"dataset_params": {
"data_augmentation": false
},
"device": "cuda",
"epochs": 180,
"log_dir": "logs/20201006",
"model_params": {
"hidden_dim": 256,
"input_dim": 80,
"n_token": 178,
"token_embedding_dim": 512
},
"optimizer_params": {
"lr": 0.0005
},
"preprocess_parasm": {
"mel_params": {
"n_mels": 80
},
"spect_params": {
"hop_length": 300,
"n_fft": 2048,
"win_length": 1200
},
"sr": 24000
},
"pretrained_model": "",
"save_freq": 5,
"train_data": "ASRDataset/train_list.txt",
"val_data": "ASRDataset/val_list.txt"
},
"BERT_CONFIG": {
"batch_size": 32,
"data_folder": "wikipedia_20220301.en.processed",
"dataset_params": {
"max_mel_length": 512,
"phoneme_mask_prob": 0.1,
"replace_prob": 0.2,
"token_maps": "token_maps.pkl",
"token_mask": "M",
"token_separator": " ",
"tokenizer": "bert-base-multilingual-cased",
"word_mask_prob": 0.15,
"word_separator": 102
},
"log_dir": "Checkpoint_all_phonemes",
"log_interval": 10,
"mixed_precision": "fp16",
"model_params": {
"dropout": 0.1,
"hidden_size": 768,
"intermediate_size": 2048,
"max_position_embeddings": 512,
"num_attention_heads": 12,
"num_hidden_layers": 12,
"vocab_size": 178
},
"num_process": 1,
"num_steps": 2000000,
"save_interval": 20000
},
"LIBRI_TTS_CONFIG": {
"ASR_config": "Utils/ASR/config.yml",
"ASR_path": "Utils/ASR/epoch_00080.pth",
"F0_path": "Utils/JDC/bst.t7",
"PLBERT_dir": "Utils/PLBERT/",
"batch_size": 8,
"data_params": {
"OOD_data": "Data/OOD_texts.txt",
"min_length": 50,
"root_path": "",
"train_data": "Data/train_list.txt",
"val_data": "Data/val_list.txt"
},
"device": "cuda",
"epochs_1st": 40,
"epochs_2nd": 25,
"first_stage_path": "first_stage.pth",
"load_only_params": false,
"log_dir": "Models/LibriTTS",
"log_interval": 10,
"loss_params": {
"TMA_epoch": 4,
"diff_epoch": 0,
"joint_epoch": 0,
"lambda_F0": 1.0,
"lambda_ce": 20.0,
"lambda_diff": 1.0,
"lambda_dur": 1.0,
"lambda_gen": 1.0,
"lambda_mel": 5.0,
"lambda_mono": 1.0,
"lambda_norm": 1.0,
"lambda_s2s": 1.0,
"lambda_slm": 1.0,
"lambda_sty": 1.0
},
"max_len": 300,
"model_params": {
"decoder": {
"resblock_dilation_sizes": [
[
1,
3,
5
],
[
1,
3,
5
],
[
1,
3,
5
]
],
"resblock_kernel_sizes": [
3,
7,
11
],
"type": "hifigan",
"upsample_initial_channel": 512,
"upsample_kernel_sizes": [
20,
10,
6,
4
],
"upsample_rates": [
10,
5,
3,
2
]
},
"diffusion": {
"dist": {
"estimate_sigma_data": true,
"mean": -3.0,
"sigma_data": 0.19926648961191362,
"std": 1.0
},
"embedding_mask_proba": 0.1,
"transformer": {
"head_features": 64,
"multiplier": 2,
"num_heads": 8,
"num_layers": 3
}
},
"dim_in": 64,
"dropout": 0.2,
"hidden_dim": 512,
"max_conv_dim": 512,
"max_dur": 50,
"multispeaker": true,
"n_layer": 3,
"n_mels": 80,
"n_token": 178,
"slm": {
"hidden": 768,
"initial_channel": 64,
"model": "microsoft/wavlm-base-plus",
"nlayers": 13,
"sr": 16000
},
"style_dim": 128
},
"optimizer_params": {
"bert_lr": 1e-05,
"ft_lr": 1e-05,
"lr": 0.0001
},
"preprocess_params": {
"spect_params": {
"hop_length": 300,
"n_fft": 2048,
"win_length": 1200
},
"sr": 24000
},
"pretrained_model": "Models/LibriTTS/epoch_2nd_00002.pth",
"save_freq": 1,
"second_stage_load_pretrained": true,
"slmadv_params": {
"batch_percentage": 0.5,
"iter": 20,
"max_len": 500,
"min_len": 400,
"scale": 0.01,
"sig": 1.5,
"thresh": 5
}
},
"config_path": null,
"model_checkpoint_path": null,
"phoneme_converter": "gruut"
}