File size: 6,275 Bytes
2b7bf83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# This is the configuration file for JSUT dataset.
# This configuration is based on StyleMelGAN paper but
# uses MSE loss instead of Hinge loss. And I found that
# batch_size = 8 is also working good. So maybe if you
# want to accelerate the training, you can reduce the
# batch size (e.g. 8 or 16). Upsampling scales is modified
# to fit the shift size 300 pt.

###########################################################
#                FEATURE EXTRACTION SETTING               #
###########################################################
sampling_rate: 24000     # Sampling rate.
fft_size: 2048           # FFT size.
hop_size: 300            # Hop size.
win_length: 1200         # Window length.
                         # If set to null, it will be the same as fft_size.
window: "hann"           # Window function.
num_mels: 80             # Number of mel basis.
fmin: 80                 # Minimum freq in mel basis calculation.
fmax: 7600               # Maximum frequency in mel basis calculation.
global_gain_scale: 1.0   # Will be multiplied to all of waveform.
trim_silence: false      # Whether to trim the start and end of silence.
trim_threshold_in_db: 60 # Need to tune carefully if the recording is not good.
trim_frame_size: 1024    # Frame size in trimming.
trim_hop_size: 256       # Hop size in trimming.
format: "hdf5"           # Feature file format. " npy " or " hdf5 " is supported.

###########################################################
#         GENERATOR NETWORK ARCHITECTURE SETTING          #
###########################################################
generator_type: "StyleMelGANGenerator" # Generator type.
generator_params:
    in_channels: 128
    aux_channels: 80
    channels: 64
    out_channels: 1
    kernel_size: 9
    dilation: 2
    bias: True
    noise_upsample_scales: [10, 2, 2, 2]
    noise_upsample_activation: "LeakyReLU"
    noise_upsample_activation_params:
        negative_slope: 0.2
    upsample_scales: [5, 1, 5, 1, 3, 1, 2, 2, 1]
    upsample_mode: "nearest"
    gated_function: "softmax"
    use_weight_norm: True

###########################################################
#       DISCRIMINATOR NETWORK ARCHITECTURE SETTING        #
###########################################################
discriminator_type: "StyleMelGANDiscriminator" # Discriminator type.
discriminator_params:
    repeats: 4
    window_sizes: [512, 1024, 2048, 4096]
    pqmf_params:
        - [1, None, None, None]
        - [2, 62, 0.26700, 9.0]
        - [4, 62, 0.14200, 9.0]
        - [8, 62, 0.07949, 9.0]
    discriminator_params:
        out_channels: 1
        kernel_sizes: [5, 3]
        channels: 16
        max_downsample_channels: 512
        bias: True
        downsample_scales: [4, 4, 4, 1]
        nonlinear_activation: "LeakyReLU"
        nonlinear_activation_params:
            negative_slope: 0.2
    use_weight_norm: True

###########################################################
#                   STFT LOSS SETTING                     #
###########################################################
stft_loss_params:
    fft_sizes: [1024, 2048, 512]  # List of FFT size for STFT-based loss.
    hop_sizes: [120, 240, 50]     # List of hop size for STFT-based loss
    win_lengths: [600, 1200, 240] # List of window length for STFT-based loss.
    window: "hann_window"         # Window function for STFT-based loss
lambda_aux: 1.0                   # Loss balancing coefficient for aux loss.

###########################################################
#               ADVERSARIAL LOSS SETTING                  #
###########################################################
lambda_adv: 1.0 # Loss balancing coefficient for adv loss.
generator_adv_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.
discriminator_adv_loss_params:
    average_by_discriminators: false # Whether to average loss by #discriminators.

###########################################################
#                  DATA LOADER SETTING                    #
###########################################################
batch_size: 32              # Batch size.
batch_max_steps: 24000      # Length of each audio in batch. Make sure dividable by hop_size.
pin_memory: true            # Whether to pin memory in Pytorch DataLoader.
num_workers: 2              # Number of workers in Pytorch DataLoader.
remove_short_samples: false # Whether to remove samples the length of which are less than batch_max_steps.
allow_cache: true           # Whether to allow cache in dataset. If true, it requires cpu memory.

###########################################################
#             OPTIMIZER & SCHEDULER SETTING               #
###########################################################
generator_optimizer_type: Adam
generator_optimizer_params:
    lr: 1.0e-4
    betas: [0.5, 0.9]
    weight_decay: 0.0
generator_scheduler_type: MultiStepLR
generator_scheduler_params:
    gamma: 0.5
    milestones:
        - 100000
        - 300000
        - 500000
        - 700000
        - 900000
generator_grad_norm: -1
discriminator_optimizer_type: Adam
discriminator_optimizer_params:
    lr: 2.0e-4
    betas: [0.5, 0.9]
    weight_decay: 0.0
discriminator_scheduler_type: MultiStepLR
discriminator_scheduler_params:
    gamma: 0.5
    milestones:
        - 200000
        - 400000
        - 600000
        - 800000
discriminator_grad_norm: -1

###########################################################
#                    INTERVAL SETTING                     #
###########################################################
discriminator_train_start_steps: 100000 # Number of steps to start to train discriminator.
train_max_steps: 1500000                # Number of training steps.
save_interval_steps: 50000              # Interval steps to save checkpoint.
eval_interval_steps: 1000              # Interval steps to evaluate the network.
log_interval_steps: 100                # Interval steps to record the training log.

###########################################################
#                     OTHER SETTING                       #
###########################################################
num_save_intermediate_results: 4  # Number of results to be saved as intermediate results.