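# NOTE: the hosting page showed the banner "Spaces: Build error" above this file;
# that banner is page residue and is not part of the configuration.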
# Training / inference configuration for the task class named in `task_cls`
# (modules.GenerSpeech.task.generspeech.GenerSpeechTask).
#
# NOTE(review): this file was recovered from a rendered table; the trailing
# "| |" cell markers were removed and the nesting under `base_config`,
# `binarization_args` and `preprocess_args` was reconstructed. The duplicated
# `reset_phone_dict` / `reset_word_dict` keys show that those two sections are
# separate mappings; the exact end of `preprocess_args` is inferred from the
# section comments -- confirm against the upstream file.

# Parent configs listed here; presumably merged before this file's values -- TODO confirm.
base_config:
  - egs/egs_bases/tts/fs2.yaml
  - egs/datasets/audio/emotion/base_text2mel.yaml

task_cls: modules.GenerSpeech.task.generspeech.GenerSpeechTask

# emotion encoder
emotion_encoder_path: checkpoints/Emotion_encoder.pt  # set the emotion encoder path

# vocoder
vocoder: hifigan
vocoder_ckpt: checkpoints/trainset_hifigan

# dataset
raw_data_dir: 'data/raw/training_set'
processed_data_dir: 'data/processed/training_set'
binary_data_dir: 'data/binary/training_set'
test_input_dir: ''

# process
binarizer_cls: data_gen.tts.base_binarizer_emotion.EmotionBinarizer
audio_sample_rate: 16000
# 256 samples = 16 ms at the 16000 Hz rate set above.
# (Inherited note: for 22050 Hz, 275 ~= 12.5 ms, i.e. 0.0125 * sample_rate.)
hop_size: 256
# 1024 samples = 64 ms at the 16000 Hz rate set above.
# (Inherited note: for 22050 Hz, 1100 ~= 50 ms, i.e. 0.05 * sample_rate;
# if None, win_size falls back to fft_size.)
win_size: 1024
# Set this to 55 if your speaker is male; for a female speaker, 95 should help
# remove noise. Tune per dataset. Pitch info: male ~[65, 260], female ~[100, 525].
fmin: 80
fmax: 7600  # To be increased/reduced depending on data.
fft_size: 1024  # Extra window size is filled with zero padding to match this parameter.
min_level_db: -100
ref_level_db: 20

binarization_args:
  reset_phone_dict: true
  reset_word_dict: true
  shuffle: true
  trim_eos_bos: false
  trim_sil: false
  with_align: true
  with_f0: true
  with_f0cwt: false
  with_linear: false
  with_spk_embed: true
  with_spk_id: true
  with_txt: true
  with_wav: true
  with_word: true

preprocess_cls: egs.datasets.audio.libritts.pre_align.LibrittsPreAlign
# NOTE(review): everything down to `vad_max_silence_length` is assumed to
# belong to `preprocess_args` -- confirm against the upstream file.
preprocess_args:
  nsample_per_mfa_group: 1000
  # text process
  txt_processor: en
  use_mfa: true
  with_phsep: true
  reset_phone_dict: true
  reset_word_dict: true
  add_eos_bos: true
  # mfa
  mfa_group_shuffle: false
  mfa_offset: 0.02
  # wav processors
  wav_processors: []
  save_sil_mask: true
  vad_max_silence_length: 12

# data
word_dict_size: 10000
num_spk: 500
use_spk_embed: true
use_spk_id: false
use_word: true
use_emotion: true
use_gt_dur: false
ref_audio: ''
text: ''

# training
num_sanity_val_steps: -1
max_updates: 300000
max_sentences: 100000
num_test_samples: 72

# glow
post_glow_hidden: 128
post_glow_kernel_size: 3
post_glow_n_blocks: 8
post_glow_n_block_layers: 3
share_wn_layers: 4
sigmoid_scale: false
post_share_cond_layers: false
use_txt_cond: true
use_latent_cond: true
noise_scale: 0.8

# prosody extractor
lambda_commit: 0.25
vq_start: 20500
vae_dropout: 0.0
nVQ: 128
forcing: 20000
crop: false
predictor_grad: 1.0