Spaces:
Running
on
T4
Running
on
T4
File size: 3,406 Bytes
b725c5a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 |
// FastSpeech2 text-to-speech experiment configuration for the LJSpeech dataset.
// Format note: this file contains // comments, so it must be read with a
// comment-tolerant parser (JSON5 / JSONC), not a strict RFC 8259 JSON parser.
{
  // NOTE(review): presumably shared defaults are inherited from this base file
  // and the keys below override them — confirm against the config loader.
  "base_config": "config/tts.json",
  "model_type": "FastSpeech2",
  "task_type": "tts",
  "dataset": ["LJSpeech"],

  "preprocess": {
    // Acoustic features
    "extract_audio": true,
    "extract_mel": true,
    "mel_extract_mode": "taco",
    "mel_min_max_norm": false,
    "extract_pitch": true,
    "extract_uv": false,
    "pitch_extractor": "dio",
    "extract_energy": true,
    "energy_extract_mode": "from_tacotron_stft",
    "extract_duration": true,
    "use_phone": true,
    // Normalization flags; they constrain model.variance_embedding
    // (see the quantization comments there).
    "pitch_norm": true,
    "energy_norm": true,
    "pitch_remove_outlier": true,
    "energy_remove_outlier": true,

    // Default config
    "n_mel": 80,
    "win_size": 1024, // TODO
    "hop_size": 256,
    "sample_rate": 22050,
    "n_fft": 1024, // TODO
    "fmin": 0,
    "fmax": 8000, // TODO
    "raw_data": "raw_data",
    "text_cleaners": ["english_cleaners"],
    "f0_min": 71, // ~C2
    "f0_max": 800, // ~G5 (800); alternative noted by the author: 1100 (~C6)
    "pitch_bin": 256,
    "pitch_max": 1100.0,
    "pitch_min": 50.0,
    "is_label": true,
    "is_mu_law": true,
    "bits": 8,
    "mel_min_max_stats_dir": "mel_min_max_stats",
    "whisper_dir": "whisper",
    "content_vector_dir": "content_vector",
    "wenet_dir": "wenet",
    "mert_dir": "mert",
    "spk2id": "spk2id.json",
    "utt2spk": "utt2spk",

    // Features used for model training
    "use_mel": true,
    "use_min_max_norm_mel": false,
    "use_frame_pitch": false,
    "use_frame_energy": false,
    "use_phone_pitch": true,
    "use_phone_energy": true,
    "use_log_scale_pitch": false,
    "use_log_scale_energy": false,
    "use_spkid": false,
    "align_mel_duration": true,
    // One of: espeak, pypinyin, pypinyin_initials_finals, lexicon
    // (only for language=en-us right now)
    "phone_extractor": "lexicon"
  },

  "model": {
    // Settings for transformer
    "transformer": {
      "encoder_layer": 4,
      "encoder_head": 2,
      "encoder_hidden": 256,
      "decoder_layer": 6,
      "decoder_head": 2,
      "decoder_hidden": 256,
      "conv_filter_size": 1024,
      "conv_kernel_size": [9, 1],
      "encoder_dropout": 0.2,
      "decoder_dropout": 0.2
    },
    // Settings for variance_predictor
    "variance_predictor": {
      "filter_size": 256,
      "kernel_size": 3,
      "dropout": 0.5
    },
    "variance_embedding": {
      // Supports "linear" or "log". "log" is allowed only if the pitch values
      // are not normalized during preprocessing (preprocess.pitch_norm is true
      // in this file).
      "pitch_quantization": "linear",
      // Supports "linear" or "log". "log" is allowed only if the energy values
      // are not normalized during preprocessing (preprocess.energy_norm is true
      // in this file).
      "energy_quantization": "linear",
      "n_bins": 256
    },
    "max_seq_len": 1000
  },

  "train": {
    "batch_size": 16,
    "sort_sample": true,
    "drop_last": true,
    "group_size": 4,
    "grad_clip_thresh": 1.0,
    "dataloader": {
      "num_worker": 8,
      "pin_memory": true
    },
    // LR scheduler
    "scheduler": "NoamLR",
    "lr_scheduler": {
      "num_warmup": 4000
    },
    // Optimizer
    "optimizer": "Adam",
    "adam": {
      "lr": 0.0625,
      "betas": [0.9, 0.98],
      "eps": 1e-9,
      "weight_decay": 0.0
    }
  }
}
|