File size: 1,219 Bytes
c89c07f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# ################################
# Model: wav2vec2
# Authors: Rudolf A. Braun 2022, Titouan Parcollet 2022
# ################################

# Sampling rate of the audio (presumably in Hz — TODO confirm). Not referenced
# by any other key visible in this file, so it is presumably read by the code
# that loads this config.
sample_rate: 16000

# Latent extractor, configured with the standard BASE-model parameters.
# `out_channels` holds seven entries, all 512.
# NOTE(review): presumably one entry per layer of the extractor — confirm
# against W2VLatentExtractor.
latent_extractor: !new:speechbrain.lobes.models.wav2vec.W2VLatentExtractor
    out_channels: [512, 512, 512, 512, 512, 512, 512]

# Transformer encoder, configured with the standard BASE-model parameters.
# It is handed to `encoder_wrapper` through `!ref <latent_encoder>`.
latent_encoder: !new:speechbrain.lobes.models.transformer.Transformer.TransformerEncoder
    # Same value (768) as `embedding_dim` in `encoder_wrapper`.
    d_model: 768
    num_layers: 12
    nhead: 8
    d_ffn: 3072
    dropout: 0.1
    layerdrop_prob: 0.0
    normalize_before: True
    # `!name` passes torch.nn.GELU itself rather than an instantiated module.
    activation: !name:torch.nn.GELU

# Wrapper around `latent_encoder`, configured with the standard BASE-model
# parameters.
encoder_wrapper: !new:speechbrain.lobes.models.wav2vec.EncoderWrapper
    # 512 equals every `out_channels` entry of `latent_extractor`, and 768
    # equals `d_model` of `latent_encoder`.
    # NOTE(review): presumably these values must be kept in sync — confirm.
    in_dim: 512
    embedding_dim: 768
    latent_encoder: !ref <latent_encoder>
    dropout_encoder_input: 0.1

# Complete encoder: a container holding `latent_extractor` followed by
# `encoder_wrapper`, registered under those same names.
# NOTE(review): presumably applied in the listed order, so do not reorder the
# keys without checking LengthsCapableSequential.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    latent_extractor: !ref <latent_extractor>
    encoder_wrapper: !ref <encoder_wrapper>

# Modules exposed by this config; only the assembled `encoder` is listed.
modules:
    encoder: !ref <encoder>

# Pretrainer listing the objects whose parameters are to be loaded.
# NOTE(review): the loadable named `latent_encoder` points at
# `<encoder_wrapper>`, not at `<latent_encoder>`. Presumably the loadable name
# has to match the stored checkpoint name — confirm before renaming the key or
# retargeting the reference.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        latent_encoder: !ref <encoder_wrapper>
        latent_extractor: !ref <latent_extractor>