Datasculptor's picture
Duplicate from AIGC-Audio/AudioGPT
98f685a
raw
history blame
3 kB
# Flat top-level scalars; top-level keys in this file are in alphabetical order.
accumulate_grad_batches: 1

audio_num_mel_bins: 80
audio_sample_rate: 24000

# One-element list of config paths (presumably the parent config this file
# extends -- confirm against the config loader).
base_config: [configs/tts/lj/fs2.yaml]
# Nested mapping: the seven keys below are children of binarization_args.
# Their indentation had been stripped, which made binarization_args parse as
# null and moved these keys into the top-level mapping. They are restored here
# with 2-space nesting. (They are known to be children because they break the
# alphabetical order of the top-level keys, which resumes at binarizer_cls.)
binarization_args:
  shuffle: false
  with_align: true
  with_f0: true
  with_f0cwt: true
  with_spk_embed: true
  with_txt: true
  with_wav: false
# Flat top-level settings, kept in their original (alphabetical) order and
# separated into groups by shared key prefix for readability only.

binarizer_cls: data_gen.tts.base_binarizer.BaseBinarizer
# NOTE(review): this path names "xiaoma1022_24k_128hop", while raw_data_dir and
# processed_data_dir elsewhere in this file point at LJSpeech -- confirm the
# mix of datasets is intended.
binary_data_dir: data/binary/xiaoma1022_24k_128hop

check_val_every_n_epoch: 10
clip_grad_norm: 1

# cwt_* keys
cwt_add_f0_loss: false
cwt_hidden_size: 128
cwt_layers: 2
cwt_loss: l1
cwt_std_scale: 0.8

debug: false

# dec_* / decoder_* keys
dec_ffn_kernel_size: 9
dec_layers: 4
decoder_type: fft

dict_dir: ''
dropout: 0.1
ds_workers: 4

# dur_* keys
# Each list entry is a comma-separated *string* (e.g. "0,2,3"), not a nested
# list of integers, so the entries are quoted in flow form.
dur_enc_hidden_stride_kernel: ['0,2,3', '0,2,3', '0,1,3']
dur_loss: mse
dur_predictor_kernel: 3
dur_predictor_layers: 2

# enc_* / encoder_* keys
enc_ffn_kernel_size: 9
enc_layers: 4
encoder_K: 8
encoder_type: fft

endless_ds: true

# ffn_* / fft_* keys
ffn_act: gelu
ffn_padding: SAME
fft_size: 512

fmax: 12000
fmin: 30

gen_dir_name: ''
hidden_size: 256
hop_size: 128
infer: false

# lambda_* keys
lambda_commit: 0.25
lambda_energy: 0.1
lambda_f0: 1.0
lambda_ph_dur: 1.0
lambda_sent_dur: 1.0
lambda_uv: 1.0
lambda_word_dur: 1.0

load_ckpt: ''
log_interval: 100
loud_norm: false
lr: 2.0

# max_* keys
max_epochs: 1000
max_eval_sentences: 1
max_eval_tokens: 60000
max_frames: 5000
max_input_tokens: 1550
max_sentences: 100000
max_tokens: 20000
max_updates: 60000

# mel_* keys
mel_loss: l1
mel_vmax: 1.5
mel_vmin: -6

min_level_db: -120
norm_type: gn

# num_* keys
num_ckpt_keep: 3
num_heads: 2
num_sanity_val_steps: 5
num_spk: 1
num_test_samples: 20
num_valid_plots: 10

# optimizer_* keys
optimizer_adam_beta1: 0.9
optimizer_adam_beta2: 0.98

out_wav_norm: false

# pitch_* keys
pitch_ar: false
# As with dur_enc_hidden_stride_kernel, each entry is a comma-separated string.
pitch_enc_hidden_stride_kernel: ['0,2,5', '0,2,5', '0,2,5']
pitch_extractor_conv_layers: 2
pitch_loss: l1
pitch_norm: log
pitch_type: frame
# Nested mapping: the six keys below are children of pre_align_args.
# Their indentation had been stripped, which made pre_align_args parse as null
# and moved these keys into the top-level mapping. They are restored here with
# 2-space nesting. (They are known to be children because they break the
# alphabetical order of the top-level keys, which resumes at pre_align_cls.)
pre_align_args:
  allow_no_txt: false
  denoise: false
  forced_align: mfa
  txt_processor: en
  use_sox: false
  use_tone: true
# Flat top-level settings, kept in their original (alphabetical) order and
# separated into groups by shared key prefix for readability only.

pre_align_cls: data_gen.tts.lj.pre_align.LJPreAlign

# predictor_* keys
predictor_dropout: 0.5
predictor_grad: 0.1
predictor_hidden: -1
predictor_kernel: 5
predictor_layers: 2

# prenet_* keys
prenet_dropout: 0.5
prenet_hidden_size: 256

pretrain_fs_ckpt: ''
processed_data_dir: data/processed/ljspeech
profile_infer: false
raw_data_dir: data/raw/LJSpeech-1.1
ref_norm_layer: bn
reset_phone_dict: true

# save_* keys
save_best: false
save_ckpt: true
save_codes: [configs, modules, tasks, utils, usr]
save_f0: false
save_gt: false

seed: 1234
sort_by_len: true
stop_token_weight: 5.0
task_cls: tasks.tts.pe.PitchExtractionTask

# test_* keys
# Twenty integer ids, in the original order.
test_ids: [68, 70, 74, 87, 110, 172, 190, 215, 231, 294,
           316, 324, 402, 422, 485, 500, 505, 508, 509, 519]
test_input_dir: ''
test_num: 523
test_set_name: test

train_set_name: train

# use_* keys
use_denoise: false
use_energy_embed: false
use_gt_dur: false
use_gt_f0: false
use_pitch_embed: true
use_pos_embed: true
use_spk_embed: false
use_spk_id: false
use_split_spk_id: false
use_uv: true
use_var_enc: false

# val_* / valid_* keys
val_check_interval: 2000
valid_num: 348
valid_set_name: valid

# vocoder* keys
vocoder: pwg
vocoder_ckpt: ''

warmup_updates: 2000
weight_decay: 0
win_size: 512
work_dir: checkpoints/0102_xiaoma_pe