File size: 4,991 Bytes
e8b63e8 916c9ff e8b63e8 916c9ff 3fec198 e8b63e8 3fec198 e8b63e8 916c9ff e8b63e8 916c9ff e8b63e8 916c9ff e8b63e8 916c9ff e8b63e8 916c9ff e8b63e8 916c9ff e8b63e8 2857446 bcf56f7 2857446 bcf56f7 56c7c0e 2857446 bcf56f7 2857446 56c7c0e 2857446 e8b63e8 457789e e8b63e8 9e5941e 457789e 9e5941e 0a69b53 da3c41d e8b63e8 da3c41d e8b63e8 2340515 bf677fa e8b63e8 9e5941e e8b63e8 457789e e8b63e8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
# ############################################################################
# Model: E2E ASR with Transformer
# Encoder: Transformer Encoder
# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch + TransformerLM
# Tokens: unigram
# Losses: CTC + KLdiv (Label Smoothing loss)
# Training: Librispeech 960h
# Authors: Jianyuan Zhong, Titouan Parcollet 2021
# ############################################################################
# Feature parameters
# All three values are passed by reference to the Fbank front end
# (compute_features).
sample_rate: 16000
n_fft: 400
n_mels: 80  # also the last dimension of the encoder's input_shape
####################### Model parameters ###########################
# Transformer
# The values below are consumed through !ref by the Transformer, ctc_lin and
# seq_lin entries of this file.
d_model: 512
nhead: 4
num_encoder_layers: 12
num_decoder_layers: 6
d_ffn: 2048
transformer_dropout: 0.1
activation: !name:torch.nn.GELU
# Used as n_neurons of both ctc_lin and seq_lin and as tgt_vocab of the
# Transformer.
output_neurons: 5000
# NOTE(review): not referenced in the visible part of this file (lm_model
# hard-codes vocab: 5000) -- confirm whether external code reads this key.
vocab_size: 5000
# Outputs
# blank_index is passed to ctc_scorer; bos_index goes to the decoder and
# eos_index to both the decoder and ctc_scorer.
blank_index: 0
# NOTE(review): label_smoothing and pad_index are not referenced in the
# visible part of this file -- confirm whether external code reads them.
label_smoothing: 0.0
pad_index: 0  # same value as blank_index
bos_index: 1
eos_index: 2
# Decoding parameters
# min/max_decode_ratio are passed to the decoder (beam searcher).
min_decode_ratio: 0.0
max_decode_ratio: 1.0
# NOTE(review): the two valid_* keys are not referenced in the visible part of
# this file -- confirm whether external code reads them.
valid_search_interval: 10
valid_beam_size: 10
test_beam_size: 66  # used as beam_size of the decoder
lm_weight: 0.60  # ScorerBuilder weight for the "transformerlm" scorer
ctc_weight_decode: 0.40  # ScorerBuilder weight for the "ctc" scorer
############################## models ################################
# Convolutional front end: three blocks with one layer each and 64 output
# channels per block. The first two blocks use kernel size 5 and stride 2; the
# last uses kernel size 1, stride 1 and is the only one with a residual.
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 3
    num_layers_per_block: 1
    out_channels: (64, 64, 64)
    kernel_sizes: (5, 5, 1)
    strides: (2, 2, 1)
    residuals: (False, False, True)
    norm: !name:speechbrain.nnet.normalization.LayerNorm
# Transformer ASR model; its sizes come from the model parameters declared at
# the top of this file.
Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR
    # presumably 64 CNN channels x (80 mel bins / 4 after two stride-2 blocks)
    # = 1280 -- confirm against the ConvolutionFrontEnd output shape.
    input_size: 1280
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    dropout: !ref <transformer_dropout>
    activation: !ref <activation>
    encoder_module: transformer
    attention_type: regularMHA
    normalize_before: True
    causal: False
# Linear layer from d_model features to output_neurons outputs; handed to
# ctc_scorer as its ctc_fc.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
# Linear layer from d_model features to output_neurons outputs; listed next to
# the Transformer in the decoder's modules.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <output_neurons>
# Scorer
# CTC scorer built on top of the ctc_lin layer.
ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    ctc_fc: !ref <ctc_lin>
# Language-model scorer wrapping the lm_model declared in this file.
transformerlm_scorer: !new:speechbrain.decoders.scorer.TransformerLMScorer
    language_model: !ref <lm_model>
    temperature: 1.15
# Combines the TransformerLM and CTC scorers, weighting them with lm_weight and
# ctc_weight_decode respectively.
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [!ref <transformerlm_scorer>, !ref <ctc_scorer>]
    weights:
        transformerlm: !ref <lm_weight>
        ctc: !ref <ctc_weight_decode>
# Beam searcher built from the Transformer and seq_lin modules, using the test
# beam size and the decode ratios declared in the decoding parameters.
decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
    modules: [!ref <Transformer>, !ref <seq_lin>]
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    temperature: 1.15
    using_eos_threshold: False
    length_normalization: True
    # Attach the CTC + TransformerLM ScorerBuilder declared in this file.
    # Without this key the scorer is built but never used, so lm_weight and
    # ctc_weight_decode have no effect on decoding.
    scorer: !ref <scorer>
# Log-softmax applied over the last dimension.
log_softmax: !new:torch.nn.LogSoftmax
    dim: -1
# Input normalization with norm_type "global"; it is one of the pretrainer's
# loadables and the "normalize" stage of the encoder.
normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
# Fbank feature extractor configured from the feature parameters declared at
# the top of this file.
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
# Transformer LM definition, as specified by the HuggingFace repository.
# See the HuggingFace model corresponding to pretrained_lm_tokenizer_path
# for more details about the model.
# NB: these settings must match the pre-trained TransformerLM!
# Encoder-only configuration (num_decoder_layers is 0); every size is
# hard-coded here rather than taken from the ASR model parameters.
lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM
    vocab: 5000
    d_model: 768
    nhead: 12
    num_encoder_layers: 12
    num_decoder_layers: 0
    d_ffn: 3072
    dropout: 0.0
    activation: !name:torch.nn.GELU
    normalize_before: False
# SentencePiece processor created with no arguments; it is listed among the
# pretrainer's loadables under the key "tokenizer".
tokenizer: !new:sentencepiece.SentencePieceProcessor
# Wrapper around the Transformer, used as the last stage of the encoder
# pipeline.
Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <Transformer>
# Encoder pipeline, in declaration order: feature extraction, normalization,
# CNN front end, then the wrapped Transformer encoder.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    cnn: !ref <CNN>
    transformer_encoder: !ref <Tencoder>
# Models
# ModuleList grouping the CNN, Transformer and both linear layers; the
# pretrainer loads it under the key "asr".
asr_model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
# Name -> object mapping of the components declared in this file.
modules:
    compute_features: !ref <compute_features>
    normalizer: !ref <normalizer>
    pre_transformer: !ref <CNN>
    transformer: !ref <Transformer>
    asr_model: !ref <asr_model>
    lm_model: !ref <lm_model>
    encoder: !ref <encoder>
    decoder: !ref <decoder>
# The pretrainer maps pretrained files onto the instances declared in this
# yaml: the normalizer, the ASR module list, the LM and the tokenizer.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        normalizer: !ref <normalizer>
        asr: !ref <asr_model>
        lm: !ref <lm_model>
        tokenizer: !ref <tokenizer>
|