# File size: 2,613 Bytes
# c06b7e2
# Loss terms. Each entry carries an `enable` flag and its own `params` mapping.
Loss:
  # Both entries below are enabled and use loss_type `mae`.
  MelReconLoss:
    enable: true
    params:
      loss_type: mae
  ProsodyReconLoss:
    enable: true
    params:
      loss_type: mae
# Model section: one entry per model name, each holding its optimizer,
# network hyper-parameters (`params`) and learning-rate scheduler.
Model:
  KanTtsSAMBERT:
    # Optimizer: `type` names the optimizer, `params` holds its arguments.
    optimizer:
      params:
        betas: [0.9, 0.98]
        eps: 1.0e-09
        lr: 0.001
        weight_decay: 0.0
      type: Adam
    # Network hyper-parameters (keys kept in their original sorted order).
    params:
      # Feature switches.
      MAS: false
      NSF: true
      SE: true
      # decoder_* settings.
      decoder_attention_dropout: 0.1
      decoder_dropout: 0.1
      decoder_ffn_inner_dim: 1024
      decoder_num_heads: 8
      decoder_num_layers: 12
      decoder_num_units: 128
      decoder_prenet_units: [256, 256]
      decoder_relu_dropout: 0.1
      # dur_pred_* settings.
      dur_pred_lstm_units: 128
      dur_pred_prenet_units: [128, 128]
      # Embedding sizes.
      embedding_dim: 512
      emotion_units: 32
      # encoder_* settings.
      encoder_attention_dropout: 0.1
      encoder_dropout: 0.1
      encoder_ffn_inner_dim: 1024
      encoder_num_heads: 8
      encoder_num_layers: 8
      encoder_num_units: 128
      encoder_projection_units: 32
      encoder_relu_dropout: 0.1
      max_len: 800
      # nsf_* settings (NSF is switched on above).
      nsf_f0_global_maximum: 730.0
      nsf_f0_global_minimum: 30.0
      nsf_norm_type: global
      # NOTE(review): num_mels is 82 while audio_config.n_mels is 80;
      # presumably the two extra channels are added because NSF is
      # enabled - confirm against the model code before changing either.
      num_mels: 82
      outputs_per_step: 3
      # postnet_* settings.
      postnet_dropout: 0.1
      postnet_ffn_inner_dim: 512
      postnet_filter_size: 41
      postnet_fsmn_num_layers: 4
      postnet_lstm_units: 128
      postnet_num_memory_units: 256
      postnet_shift: 17
      # predictor_* settings.
      predictor_dropout: 0.1
      predictor_ffn_inner_dim: 256
      predictor_filter_size: 41
      predictor_fsmn_num_layers: 3
      predictor_lstm_units: 128
      predictor_num_memory_units: 128
      predictor_shift: 0
      speaker_units: 192
    # Learning-rate scheduler: `type` names it, `params` holds its arguments.
    scheduler:
      params:
        warmup_steps: 4000
      type: NoamLR
allow_cache: false
# Audio feature-extraction settings, written in block style (one key per
# line) with the same keys, values and order as before.
audio_config:
  # fmax equals sampling_rate / 2 (8000.0 for 16000).
  fmax: 8000.0
  fmin: 0.0
  # NOTE(review): if hop_length / win_length are counted in samples, they
  # are 12.5 ms / 62.5 ms at sampling_rate 16000 - confirm units.
  hop_length: 200
  max_norm: 1.0
  min_level_db: -100.0
  n_fft: 2048
  n_mels: 80
  norm_type: mean_std
  # This num_workers is nested under audio_config and is a separate key
  # from the top-level num_workers.
  num_workers: 16
  phone_level_feature: true
  preemphasize: false
  ref_level_db: 20
  sampling_rate: 16000
  symmetric: false
  trim_silence: true
  trim_silence_threshold_db: 60
  wav_normalize: true
  win_length: 1000
# Top-level run settings and provenance metadata.
batch_size: 32
create_time: '2023-07-08 01:06:41'
# NOTE(review): eval_interval_steps is far larger than train_max_steps, so
# an evaluation triggered every eval_interval_steps would never fire in
# this run - presumably intentional; confirm.
eval_interval_steps: 10000000000000000
# Quoted so the hash always loads as a string.
git_revision_hash: 'd16755444c9baf23348213211a5ed9035458ecf0'
grad_norm: 1.0
# Linguistic front-end settings, written in block style.
linguistic_unit:
  cleaners: english_cleaners
  # Comma-separated list kept as a single quoted string.
  lfeat_type_list: 'sy,tone,syllable_flag,word_segment,emo_category,speaker_category'
  speaker_list: F7
# NOTE(review): both log_interval and log_interval_steps are set; confirm
# which one the trainer actually reads.
log_interval: 10
log_interval_steps: 50
model_type: sambert
# Quoted so the version always loads as a string.
modelscope_version: '1.7.1'
num_save_intermediate_results: 4
num_workers: 4
pin_memory: false
remove_short_samples: false
save_interval_steps: 500
# NOTE(review): train_max_steps - train_steps = 2400000; presumably the
# step count of the checkpoint this run continues from - confirm.
train_max_steps: 2400502
train_steps: 502