# ############################################################################
# Model: E2E ST JA->EN with Conformer
# Encoder: Conformer Encoder
# Decoder: Transformer Decoder + (CTC/ATT joint)
# Tokens: BPE
# Losses: label-smoothed NLL (the CTC/ASR/MT weights are set to 0 below)
# Training data: custom JA->EN YouTube scrape, ~600h
# Authors: Eric Engelhart, 2022
# ############################################################################

# Tokenizer initialization (the trained model is loaded via the pretrainer below)
tokenizer: !new:sentencepiece.SentencePieceProcessor

# Features
sample_rate: 16000
n_fft: 400
n_mels: 80
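# (400-sample FFT window = 25 ms at 16 kHz; 80 log-mel bins)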

# Normalization
normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global

compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>

####################### Model parameters ###########################
# Transformer
d_model: 384
nhead: 6
num_encoder_layers: 12
num_decoder_layers: 6
d_ffn: 1536
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 5000
vocab_size: 5000
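# Both of the above must match the SentencePiece BPE vocabulary size (5000).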
attention_type: "regularMHA" # "RelPosMHAXL" or "regularMHA"
kernel_size: 15
encoder_module: conformer


# Outputs
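# Special tokens: blank, pad, and unk all share index 0; bos=1, eos=2.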
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2
unk_index: 0

# Decoding parameters
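# Greedy search (beam 1) on the validation set every 2 epochs; beam 25 at test time.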
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_search_interval: 2
valid_beam_size: 1
test_beam_size: 25

############################## models ################################
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (256, 256)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)
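# The two stride-2 blocks downsample time by 4x; flattening 256 channels over
# the remaining 80 / 4 = 20 frequency bins gives the Transformer input_size of
# 256 * 20 = 5120.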

Transformer: !new:speechbrain.lobes.models.transformer.TransformerST.TransformerST # yamllint disable-line rule:line-length
    input_size: 5120
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
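    # Pure end-to-end ST: the auxiliary CTC, ASR, and MT objectives are disabled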
    ctc_weight: 0
    asr_weight: 0
    mt_weight: 0
    asr_tgt_vocab: !ref <output_neurons>
    mt_src_vocab: !ref <output_neurons>
    attention_type: !ref <attention_type>
    kernel_size: !ref <kernel_size>
    encoder_module: !ref <encoder_module>
    normalize_before: True
    causal: False
    max_length: 5000

# Only used in the multi-task setting (ctc_weight > 0); unused here
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>

# Only used when asr_weight > 0 and ctc_weight < 1; unused here since asr_weight is 0
asr_seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <vocab_size>


st_model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>]
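# The list above groups the parameters restored by the pretrainer's "st" loadable.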


Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>
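
# Inference-time encoding pipeline: fbanks -> normalization -> CNN front-end
# -> Conformer encoder.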

encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    cnn: !ref <CNN>
    transformer_encoder: !ref <Tencoder>



decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
    modules: [!ref <Transformer>, !ref <seq_lin>, null]
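    # Third module slot is the CTC head (ctc_lin); null here since ctc_weight is 0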
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    using_eos_threshold: True
    length_normalization: True
    ctc_weight: 0
    lm_weight: 0


modules:
    compute_features: !ref <compute_features>
    normalizer: !ref <normalizer>
    pre_transformer: !ref <CNN>
    Transformer: !ref <Transformer>
    asr_model: !ref <st_model>
    encoder: !ref <encoder>
    decoder: !ref <decoder>


log_softmax: !new:torch.nn.LogSoftmax
    dim: -1
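# Applied to the seq_lin outputs to produce log-probabilities.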



# The pretrainer allows a mapping between pretrained files and instances that
# are declared in the yaml.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        tokenizer: !ref <tokenizer>
        st: !ref <st_model>
        normalizer: !ref <normalizer>
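
# A minimal loading sketch (illustrative, not part of the recipe): this file
# exposes "encoder"/"decoder" modules and a tokenizer loadable, so it can be
# driven by SpeechBrain's pretrained interface. The source path and savedir
# below are placeholders, not the model's published location.
#
#   from speechbrain.pretrained import EncoderDecoderASR
#
#   st = EncoderDecoderASR.from_hparams(
#       source="path/or/hub-id/of/this/model",  # placeholder location
#       savedir="pretrained_st",                # local cache dir (illustrative)
#   )
#   print(st.transcribe_file("example.wav"))    # output is the EN translation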