File size: 2,482 Bytes
f79a1ca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# STFT arguments
# These four values are passed unchanged (via !ref) to both compute_stft and
# compute_istft, so analysis and resynthesis use identical settings.
# n_fft additionally sizes the enhancement model: output_size and the embedding
# input_shape use n_fft // 2 + 1, and d_model uses n_fft // 2.
# NOTE(review): SpeechBrain's STFT/ISTFT document win_length and hop_length in
# milliseconds (not samples) — confirm against the installed version.
sample_rate: 16000
n_fft: 512
win_length: 32
hop_length: 16

# Enhancement model args
# Scalars intended for the enhance_model definition further down.
# conv1 uses emb_channels directly; conv2 uses emb_channels // 2 and
# conv3/conv4 use emb_channels // 4.
emb_channels: 1024
# NOTE(review): intended for the Conv1d layers of custom_emb_module; verify
# that those layers actually !ref these two values rather than hard-coding them.
emb_kernel_size: 3
emb_padding: same
# Passed to enhance_model as d_ffn.
enhancer_size: 512
# Passed to enhance_model as num_layers.
enhancer_layers: 8
# Passed to enhance_model as nhead.
enhancer_heads: 8
# Passed to enhance_model as causal.
enhancer_causal: False
# Passed to enhance_model as dropout.
enhancer_drop_rate: 0.1

# STFT object, instantiated once at load time (!new) from the shared STFT
# arguments. Also handed to `resynth` as its `stft` argument.
compute_stft: !new:speechbrain.processing.features.STFT
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    win_length: !ref <win_length>
    hop_length: !ref <hop_length>

# ISTFT object built with exactly the same four arguments as compute_stft.
# Also handed to `resynth` as its `istft` argument.
compute_istft: !new:speechbrain.processing.features.ISTFT
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    win_length: !ref <win_length>
    hop_length: !ref <hop_length>

# !name yields a callable with `power` pre-bound rather than calling the
# function at load time.
# NOTE(review): SpeechBrain documents power=0.5 as the magnitude spectrogram
# and power=1 as the power spectrogram — confirm against the installed version.
spectral_magnitude: !name:speechbrain.processing.features.spectral_magnitude
    power: 0.5

# Callable (!name, so not invoked at load time) wrapping `resynthesize` with
# the STFT/ISTFT objects defined above pre-bound; remaining arguments are
# supplied by the caller.
resynth: !name:speechbrain.processing.signal_processing.resynthesize
    stft: !ref <compute_stft>
    istft: !ref <compute_istft>

# Enhancement network: a CNNTransformerSE instance whose input embedding is a
# custom four-layer Conv1d stack (custom_emb_module).
enhance_model: !new:speechbrain.lobes.models.transformer.TransformerSE.CNNTransformerSE
    # One output per one-sided STFT frequency bin.
    output_size: !ref <n_fft> // 2 + 1
    # NOTE(review): d_model is expected to equal the channel count leaving
    # custom_emb_module (emb_channels // 4). With the values in this file both
    # evaluate to 256; keep them in sync when changing n_fft or emb_channels.
    d_model: !ref <n_fft> // 2
    output_activation: !name:torch.nn.ReLU
    activation: !name:torch.nn.LeakyReLU
    dropout: !ref <enhancer_drop_rate>
    num_layers: !ref <enhancer_layers>
    d_ffn: !ref <enhancer_size>
    nhead: !ref <enhancer_heads>
    causal: !ref <enhancer_causal>
    # Embedding front-end: conv -> LayerNorm -> LeakyReLU, four times, with the
    # channel count going emb_channels -> /2 -> /4 -> /4.
    # Kernel size and padding come from emb_kernel_size / emb_padding so that
    # the declared hyperparameters actually control every conv layer (they
    # resolve to 3 and "same", the values previously hard-coded / defaulted).
    custom_emb_module: !new:speechbrain.nnet.containers.Sequential
        # Last dimension is the number of one-sided STFT bins; the two leading
        # dimensions are left unspecified (null).
        input_shape: [null, null, !ref <n_fft> // 2 + 1]
        conv1: !name:speechbrain.nnet.CNN.Conv1d
            out_channels: !ref <emb_channels>
            kernel_size: !ref <emb_kernel_size>
            padding: !ref <emb_padding>
        norm1: !name:speechbrain.nnet.normalization.LayerNorm
        act1: !new:torch.nn.LeakyReLU
        conv2: !name:speechbrain.nnet.CNN.Conv1d
            out_channels: !ref <emb_channels> // 2
            kernel_size: !ref <emb_kernel_size>
            padding: !ref <emb_padding>
        norm2: !name:speechbrain.nnet.normalization.LayerNorm
        act2: !new:torch.nn.LeakyReLU
        conv3: !name:speechbrain.nnet.CNN.Conv1d
            out_channels: !ref <emb_channels> // 4
            kernel_size: !ref <emb_kernel_size>
            padding: !ref <emb_padding>
        norm3: !name:speechbrain.nnet.normalization.LayerNorm
        act3: !new:torch.nn.LeakyReLU
        conv4: !name:speechbrain.nnet.CNN.Conv1d
            out_channels: !ref <emb_channels> // 4
            kernel_size: !ref <emb_kernel_size>
            padding: !ref <emb_padding>
        norm4: !name:speechbrain.nnet.normalization.LayerNorm
        act4: !new:torch.nn.LeakyReLU

# Named collection of model objects exposed by this file; it holds the same
# enhance_model instance defined above (shared by !ref, not a copy).
# NOTE(review): presumably read by the code that loads this YAML to locate the
# model — confirm against the consuming interface.
modules:
    enhance_model: !ref <enhance_model>

# Pretrainer instance with enhance_model registered as its only loadable.
# NOTE(review): presumably used to fetch and load saved parameters into
# enhance_model; the key name "enhance_model" likely determines which
# checkpoint file is looked up — confirm against Pretrainer's documentation.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        enhance_model: !ref <enhance_model>