File size: 4,012 Bytes
a87b222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d73b65f
 
a87b222
 
 
 
 
 
 
 
 
f6d184b
a87b222
 
 
 
 
 
 
 
 
 
 
7bacef7
a87b222
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9de803c
 
a87b222
9de803c
 
 
 
 
f6d184b
9de803c
 
a87b222
0aedced
a87b222
 
0aedced
 
 
 
 
 
 
 
 
 
 
a87b222
 
 
 
 
0aedced
a87b222
9de803c
f6d184b
9de803c
 
31a6fb1
9de803c
a87b222
 
 
 
 
f6d184b
a87b222
0aedced
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# ############################################################################
# Model: E2E ASR with Transformer
# Encoder: Transformer Encoder
# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch
# Tokens: BPE with unigram
# losses: CTC + KLdiv (Label Smoothing loss)
# Training: AISHELL-1
# Authors:  Jianyuan Zhong, Titouan Parcollet
# ############################################################################

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 80

####################### Model parameters ###########################
# Transformer
d_model: 256
nhead: 4
num_encoder_layers: 12
num_decoder_layers: 6
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 5000
vocab_size: 5000

# Outputs
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2
unk_index: 0

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0 # 1.0
valid_search_interval: 10
valid_beam_size: 10
test_beam_size: 10
ctc_weight_decode: 0.40

############################## models ################################

compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>

normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global


CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (256, 256)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)
    norm: !name:speechbrain.nnet.normalization.BatchNorm2d

Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    input_size: 5120
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    normalize_before: True

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>

tokenizer: !new:sentencepiece.SentencePieceProcessor

asr_model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]

# Here, we extract the encoder from the Transformer model
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>

# We compose the inference (encoder) pipeline.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    cnn: !ref <CNN>
    transformer_encoder: !ref <Tencoder>

ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    ctc_fc: !ref <ctc_lin>
    
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [!ref <ctc_scorer>]
    weights:
        ctc: !ref <ctc_weight_decode>
        
decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
    modules: [!ref <Transformer>, !ref <seq_lin>]
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    using_eos_threshold: False
    length_normalization: True
    scorer: !ref <scorer>

modules:
    normalizer: !ref <normalizer>
    encoder: !ref <encoder>
    decoder: !ref <decoder>
    ctc_lin: !ref <ctc_lin>

log_softmax: !new:torch.nn.LogSoftmax
    dim: -1

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        normalizer: !ref <normalizer>
        asr: !ref <asr_model>
        tokenizer: !ref <tokenizer>