File size: 4,922 Bytes
7320fc0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ecb7464
7320fc0
 
 
 
 
 
 
 
f101e20
7320fc0
 
 
 
 
 
 
 
 
 
 
 
 
 
f101e20
 
 
7320fc0
 
 
 
 
 
 
 
f101e20
7320fc0
f101e20
 
 
 
7320fc0
 
6454ad9
7320fc0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6454ad9
 
 
 
 
7320fc0
 
6454ad9
 
 
 
 
 
 
 
 
 
 
 
7320fc0
 
 
f101e20
7320fc0
 
6454ad9
 
7320fc0
 
 
 
 
 
 
 
 
 
 
 
6454ad9
7320fc0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6454ad9
7320fc0
 
 
 
 
 
 
6454ad9
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# ############################################################################
# Model: E2E ASR with Transformer
# Encoder: Conformer Encoder
# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch + TransformerLM
# Tokens: unigram
# Losses: CTC + KLdiv (Label Smoothing loss)
# Training: KsponSpeech 965.2h
# Based on the works of: Jianyuan Zhong, Titouan Parcollet 2021
# Authors: Dongwon Kim, Dongwoo Kim 2021
# ############################################################################
# NOTE: no seed is defined in this file. If one is added, it needs to be set
# here at the top of the yaml, before objects with parameters are made.

# Feature parameters: referenced by the `compute_features` (Fbank) entry of
# this file; n_mels is also the last dimension of the `encoder` input_shape.
n_mels: 80
n_fft: 400
sample_rate: 16000

####################### Model parameters ###########################
# Sizes of the ASR Transformer; each key below is consumed through !ref by the
# `Transformer`, `ctc_lin` or `seq_lin` entries of this file.
d_model: 256
d_ffn: 2048
nhead: 4
num_encoder_layers: 12
num_decoder_layers: 6
transformer_dropout: 0.1
activation: !name:torch.nn.GELU

# Output vocabulary size. output_neurons is the key referenced by the models;
# vocab_size is not referenced in the visible part of this file.
output_neurons: 5000
vocab_size: 5000

# Outputs: special indices. Index 0 is shared by blank, pad and unk; bos and
# eos have their own indices.
pad_index: 0
blank_index: 0
unk_index: 0
bos_index: 1
eos_index: 2

# Decoding parameters, consumed through !ref by the `decoder` entry
# (beam size and decode ratios) and by `scorer.weights` (the two weights).
test_beam_size: 66
min_decode_ratio: 0.0
max_decode_ratio: 1.0
# Weight of the TransformerLM scorer and of the CTC scorer, respectively.
lm_weight: 0.60
ctc_weight_decode: 0.50

############################## models ################################

# Input normalization object. It is the `normalize` stage of the `encoder`
# entry and is listed as the `normalizer` loadable of the pretrainer.
normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global

# Convolutional front-end with three blocks of one layer each. It is the `cnn`
# stage of the `encoder` entry and is registered as `pre_transformer` in
# `modules`.
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 3
    num_layers_per_block: 1
    # Three-element tuples, matching num_blocks.
    kernel_sizes: (5, 5, 1)
    strides: (2, 2, 1)
    out_channels: (64, 64, 64)
    residuals: (False, False, True)

# ASR model: Conformer encoder with a Transformer decoder. Sizes are taken
# from the model parameters declared at the top of this file.
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    # NOTE(review): 1280 is presumably 64 CNN channels x 20 frequency bins
    # (80 mels halved by two stride-2 blocks) - confirm against the CNN output.
    input_size: 1280
    tgt_vocab: !ref <output_neurons>
    # Architecture choices
    encoder_module: conformer
    attention_type: RelPosMHAXL
    activation: !ref <activation>
    normalize_before: True
    causal: False
    # Dimensions and regularization
    d_model: !ref <d_model>
    d_ffn: !ref <d_ffn>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    dropout: !ref <transformer_dropout>

# NB: It has to match the pre-trained TransformerLM!!
# These values are therefore hard-coded rather than tied to the ASR model
# parameters; only the vocabulary size is shared through <output_neurons>.
lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM # yamllint disable-line rule:line-length
    vocab: !ref <output_neurons>
    d_model: 768
    d_ffn: 3072
    nhead: 12
    num_encoder_layers: 12
    num_decoder_layers: 0
    activation: !name:torch.nn.GELU
    normalize_before: False
    dropout: 0.0

# SentencePiece processor constructed without arguments; it is listed as the
# `tokenizer` loadable of the pretrainer.
tokenizer: !new:sentencepiece.SentencePieceProcessor

# Linear layer from d_model to the output vocabulary, used as `ctc_fc` by
# `ctc_scorer` and included in `asr_model`.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    n_neurons: !ref <output_neurons>
    input_size: !ref <d_model>

# Linear layer from d_model to the output vocabulary, passed to `decoder`
# alongside the Transformer and included in `asr_model`.
seq_lin: !new:speechbrain.nnet.linear.Linear
    n_neurons: !ref <output_neurons>
    input_size: !ref <d_model>

# Language-model scorer built around <lm_model>. It is one of the two
# full_scorers of `scorer`, weighted by the `transformerlm` entry there.
transformerlm_scorer: !new:speechbrain.decoders.scorer.TransformerLMScorer
    language_model: !ref <lm_model>
    temperature: 1.15

# CTC scorer using <ctc_lin> as its output layer. It is one of the two
# full_scorers of `scorer`, weighted by the `ctc` entry there.
ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    ctc_fc: !ref <ctc_lin>

# Combines the TransformerLM and CTC scorers for the beam search in `decoder`.
# The weights come from <lm_weight> and <ctc_weight_decode>.
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [!ref <transformerlm_scorer>, !ref <ctc_scorer>]
    weights:
        transformerlm: !ref <lm_weight>
        ctc: !ref <ctc_weight_decode>

# Beam searcher over the Transformer decoder and its output projection,
# rescored by `scorer` (TransformerLM + CTC).
decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
    modules:
        - !ref <Transformer>
        - !ref <seq_lin>
    scorer: !ref <scorer>
    # Special indices
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    # Search settings
    beam_size: !ref <test_beam_size>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    temperature: 1.15
    using_eos_threshold: False
    length_normalization: True


# EncoderWrapper built around <Transformer>; it is the `transformer_encoder`
# stage of the `encoder` entry.
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>

# Encoder pipeline: feature extraction, normalization, CNN front-end and the
# wrapped Transformer encoder.
# NOTE(review): the stages are presumably applied in the order listed here, so
# the keys should not be reordered - confirm against LengthsCapableSequential.
# <compute_features> is declared further down in this file.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    cnn: !ref <CNN>
    transformer_encoder: !ref <Tencoder>

# ModuleList holding the CNN, Transformer and both linear output layers; it is
# the `asr` loadable of the pretrainer.
# NOTE(review): the list order presumably has to match the layout of the
# pretrained checkpoint - confirm before reordering.
asr_model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]

# LogSoftmax over the last dimension. It is not referenced by any other entry
# in the visible part of this file.
log_softmax: !new:torch.nn.LogSoftmax
    dim: -1


# Fbank feature extractor configured from the feature parameters at the top of
# this file; it is the first stage of the `encoder` entry.
compute_features: !new:speechbrain.lobes.features.Fbank
    n_mels: !ref <n_mels>
    n_fft: !ref <n_fft>
    sample_rate: !ref <sample_rate>

# Named modules of this configuration. Every value references an object
# declared elsewhere in this file.
modules:
    compute_features: !ref <compute_features>
    normalizer: !ref <normalizer>
    pre_transformer: !ref <CNN>
    transformer: !ref <Transformer>
    asr_model: !ref <asr_model>
    lm_model: !ref <lm_model>
    encoder: !ref <encoder>
    decoder: !ref <decoder>

# The pretrainer allows a mapping between pretrained files and instances that
# are declared in the yaml. Four instances are mapped here: the normalizer,
# the ASR ModuleList, the language model and the tokenizer.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        normalizer: !ref <normalizer>
        asr: !ref <asr_model>
        lm: !ref <lm_model>
        tokenizer: !ref <tokenizer>