RMSnow's picture
add backend inference and interface output
0883aa1
raw
history blame
3.28 kB
{
"base_config": "config/tts.json",
"model_type": "FastSpeech2",
"task_type": "tts",
"dataset": ["LJSpeech"],
"preprocess": {
// acoustic features
"extract_audio": true,
"extract_mel": true,
"mel_extract_mode": "taco",
"mel_min_max_norm": false,
"extract_pitch": true,
"extract_uv": false,
"pitch_extractor": "dio",
"extract_energy": true,
"energy_extract_mode": "from_tacotron_stft",
"extract_duration": true,
"use_phone": true,
"pitch_norm": true,
"energy_norm": true,
"pitch_remove_outlier": true,
"energy_remove_outlier": true,
// Default config
"n_mel": 80,
"win_size": 1024, // todo
"hop_size": 256,
"sample_rate": 22050,
"n_fft": 1024, // todo
"fmin": 0,
"fmax": 8000, // todo
"raw_data": "raw_data",
"text_cleaners": ["english_cleaners"],
"f0_min": 71, // ~C2
"f0_max": 800, //1100, // ~C6(1100), ~G5(800)
"pitch_bin": 256,
"pitch_max": 1100.0,
"pitch_min": 50.0,
"is_label": true,
"is_mu_law": true,
"bits": 8,
"mel_min_max_stats_dir": "mel_min_max_stats",
"whisper_dir": "whisper",
"content_vector_dir": "content_vector",
"wenet_dir": "wenet",
"mert_dir": "mert",
"spk2id":"spk2id.json",
"utt2spk":"utt2spk",
// Features used for model training
"use_mel": true,
"use_min_max_norm_mel": false,
"use_frame_pitch": false,
"use_frame_energy": false,
"use_phone_pitch": true,
"use_phone_energy": true,
"use_log_scale_pitch": false,
"use_log_scale_energy": false,
"use_spkid": false,
"align_mel_duration": true,
"text_cleaners": ["english_cleaners"]
},
"model": {
// Settings for transformer
"transformer": {
"encoder_layer": 4,
"encoder_head": 2,
"encoder_hidden": 256,
"decoder_layer": 6,
"decoder_head": 2,
"decoder_hidden": 256,
"conv_filter_size": 1024,
"conv_kernel_size": [9, 1],
"encoder_dropout": 0.2,
"decoder_dropout": 0.2
},
// Settings for variance_predictor
"variance_predictor":{
"filter_size": 256,
"kernel_size": 3,
"dropout": 0.5
},
"variance_embedding":{
"pitch_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the pitch values are not normalized during preprocessing
"energy_quantization": "linear", // support 'linear' or 'log', 'log' is allowed only if the energy values are not normalized during preprocessing
"n_bins": 256
},
"max_seq_len": 1000
},
"train":{
"batch_size": 16,
"sort_sample": true,
"drop_last": true,
"group_size": 4,
"grad_clip_thresh": 1.0,
"dataloader": {
"num_worker": 8,
"pin_memory": true
},
"lr_scheduler":{
"num_warmup": 4000
},
// LR Scheduler
"scheduler": "NoamLR",
// Optimizer
"optimizer": "Adam",
"adam": {
"lr": 0.0625,
"betas": [0.9, 0.98],
"eps": 0.000000001,
"weight_decay": 0.0
},
}
}