# ############################################################################
# Model: E2E ST JA->EN with Conformer
# Encoder: Conformer Encoder
# Decoder: Transformer Decoder + (CTC/ATT joint)
# Tokens: BPE
# Losses: CTC + KLdiv (label smoothing)
# Training data: custom JA->EN YouTube scrape, ~600h
# Authors: Eric Engelhart, 2022
# ############################################################################
# Tokenizer initialization
tokenizer: !new:sentencepiece.SentencePieceProcessor
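# The processor is constructed empty here; the pretrainer at the bottom of
# this file loads the trained BPE model into it.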
# Features
sample_rate: 16000
n_fft: 400
n_mels: 80
# normalization
normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global

compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
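# At a 16 kHz sample rate, n_fft: 400 corresponds to a 25 ms analysis window
# (400 / 16000 s). The resulting 80-dim log Mel filterbanks are mean/variance
# normalized with statistics accumulated over the whole training set
# (norm_type: global).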
####################### Model parameters ###########################
# Transformer
d_model: 384
nhead: 6
num_encoder_layers: 12
num_decoder_layers: 6
d_ffn: 1536
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
output_neurons: 5000
vocab_size: 5000
attention_type: "regularMHA" # "RelPosMHAXL" or "regularMHA"
kernel_size: 15
encoder_module: conformer
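# Derived sizes: d_ffn is the usual 4 * d_model (1536 = 4 * 384), and each of
# the 6 attention heads covers 384 / 6 = 64 dimensions. kernel_size: 15 is the
# depthwise kernel width of the Conformer convolution module.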
# Outputs
blank_index: 0
label_smoothing: 0.1
pad_index: 0
bos_index: 1
eos_index: 2
unk_index: 0
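# blank (CTC), pad, and unk all share index 0 here; these indices must match
# the ones used when the SentencePiece model was trained.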
# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_search_interval: 2
valid_beam_size: 1
test_beam_size: 25
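# Validation decodes greedily (beam 1), and only every valid_search_interval
# epochs, to keep training cheap; final test decoding uses a beam of 25.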
############################## models ################################
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (256, 256)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)
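# Shape sketch: the two stride-2 blocks downsample both the time and the
# feature axis by 4x, so the 80 mels become 20 bins with 256 channels, and the
# flattened encoder input is 256 * 20 = 5120, matching input_size in the
# Transformer below. input_shape is a dummy example shape used only to infer
# layer dimensions.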
Transformer: !new:speechbrain.lobes.models.transformer.TransformerST.TransformerST # yamllint disable-line rule:line-length
    input_size: 5120
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    ctc_weight: 0
    asr_weight: 0
    mt_weight: 0
    asr_tgt_vocab: !ref <output_neurons>
    mt_src_vocab: !ref <output_neurons>
    attention_type: !ref <attention_type>
    kernel_size: !ref <kernel_size>
    encoder_module: !ref <encoder_module>
    normalize_before: True
    causal: False
    max_length: 5000
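# With ctc_weight, asr_weight, and mt_weight all set to 0, the auxiliary CTC,
# ASR, and MT branches are inactive and only the attention ST decoder is used
# at inference time.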
# only when multi-task setting is used
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
# only used when asr_weight > 0 and ctc_weight < 1
asr_seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <vocab_size>

st_model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>]
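# Grouping the parameter-holding modules in one ModuleList gives the
# pretrainer below a single "st" entry through which the whole checkpoint is
# loaded.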
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>

encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    cnn: !ref <CNN>
    transformer_encoder: !ref <Tencoder>
decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
    modules: [!ref <Transformer>, !ref <seq_lin>, null]
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    using_eos_threshold: True
    length_normalization: True
    ctc_weight: 0
    lm_weight: 0
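# ctc_weight and lm_weight are 0, so the beam is scored by the attention
# decoder alone; the trailing null in modules takes the place of the CTC
# head, which is not needed here.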
modules:
    compute_features: !ref <compute_features>
    normalizer: !ref <normalizer>
    pre_transformer: !ref <CNN>
    Transformer: !ref <Transformer>
    asr_model: !ref <st_model>
    encoder: !ref <encoder>
    decoder: !ref <decoder>
log_softmax: !new:torch.nn.LogSoftmax
    dim: -1
# The pretrainer allows a mapping between pretrained files and instances that
# are declared in the yaml.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        tokenizer: !ref <tokenizer>
        st: !ref <st_model>
        normalizer: !ref <normalizer>
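
# Usage sketch (assumptions: this file is served from a model repo and is
# compatible with SpeechBrain's generic pretrained interface, which needs the
# tokenizer, encoder, and decoder entries defined above; the repo id below is
# a placeholder). transcribe_file returns the English translation of the
# Japanese input audio:
#
#     from speechbrain.pretrained import EncoderDecoderASR
#
#     st = EncoderDecoderASR.from_hparams(
#         source="<hf-user>/<this-repo>",  # hypothetical repo id
#         savedir="pretrained_st_ja_en",
#     )
#     print(st.transcribe_file("example_ja.wav"))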