# ################################
# Model: VGG2 + LSTM + time pooling
# Augmentation: SpecAugment
# Authors: Titouan Parcollet, Mirco Ravanelli, Peter Plantinga, Ju-Chieh Chou,
# and Abdel HEBA 2020
# ################################
# Feature parameters (FBANKS etc).
# These three values are forwarded to `compute_features` via !ref.
sample_rate: 16000
n_fft: 400
n_mels: 80
# Model parameters.
# Shared hyperparameters; the object definitions further down pull them in
# through !ref so each value is set in exactly one place.
activation: !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 3
cnn_channels: (128, 200, 256)
inter_layer_pooling_size: (2, 2, 2)
cnn_kernelsize: (3, 3)
time_pooling_size: 4
rnn_class: !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 5
rnn_neurons: 1024
rnn_bidirectional: True
dnn_blocks: 2
dnn_neurons: 1024
emb_size: 128
dec_neurons: 1024
# Outputs
output_neurons: 500  # BPE size, index(blank/eos/bos) = 0

# Decoding parameters.
# Make sure the bos and eos indices match the ones used by the BPE tokenizer.
# NOTE(review): `blank_index` and `ctc_weight_decode` are not referenced by
# any other entry visible in this file — confirm whether external code reads them.
blank_index: 0
bos_index: 0
eos_index: 0
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 80
eos_threshold: 1.5
using_max_attn_shift: True
max_attn_shift: 140
ctc_weight_decode: 0.0
temperature: 1.50
# Input feature normalizer; also registered under `modules` and listed in the
# pretrainer's `loadables`.
normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global
# Filterbank feature extractor, configured from the feature parameters above.
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
# Acoustic encoder (CRDNN). All sizes come from the model parameters above.
enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
    # Last dimension is the number of mel filterbanks; the two leading
    # dimensions are left unspecified (null).
    input_shape: [null, null, !ref <n_mels>]
    activation: !ref <activation>
    dropout: !ref <dropout>
    cnn_blocks: !ref <cnn_blocks>
    cnn_channels: !ref <cnn_channels>
    cnn_kernelsize: !ref <cnn_kernelsize>
    inter_layer_pooling_size: !ref <inter_layer_pooling_size>
    time_pooling: True
    using_2d_pooling: False
    time_pooling_size: !ref <time_pooling_size>
    rnn_class: !ref <rnn_class>
    rnn_layers: !ref <rnn_layers>
    rnn_neurons: !ref <rnn_neurons>
    rnn_bidirectional: !ref <rnn_bidirectional>
    rnn_re_init: True
    dnn_blocks: !ref <dnn_blocks>
    dnn_neurons: !ref <dnn_neurons>
# Token embedding table: one row per output token, `emb_size` columns.
emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>
# Attentional GRU decoder with location-aware attention.
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <dnn_neurons>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: location
    # Tied to `dec_neurons` (currently 1024, same value as before) because
    # `seq_lin.input_size` reads `dec_neurons` as well; keeping both on the
    # same reference prevents a silent size mismatch if it is ever changed.
    hidden_size: !ref <dec_neurons>
    attn_dim: 1024
    num_layers: 1
    scaling: 1.0
    channels: 10
    kernel_size: 100
    re_init: True
    dropout: !ref <dropout>
# Linear projection from the encoder's DNN width to the output vocabulary.
# Part of `asr_model`, so its weights are loaded with the rest of the model.
ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>
# Linear projection from the decoder width to the output vocabulary; passed to
# the beam searcher as its `linear` argument.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>
# Softmax activation with `apply_log` enabled.
# NOTE(review): not referenced by any other entry visible in this file.
log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True
# All trainable sub-modules grouped into one ModuleList; the pretrainer loads
# it under the `asr` key.
asr_model: !new:torch.nn.ModuleList
    - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
# SentencePiece tokenizer instance; listed in the pretrainer's `loadables`.
tokenizer: !new:sentencepiece.SentencePieceProcessor
# We compose the inference (encoder) pipeline:
# feature extraction -> normalization -> CRDNN encoder, in that order.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null, !ref <n_mels>]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    model: !ref <enc>
# Beam-search decoder wired to the embedding, attentional decoder and output
# projection defined above; search settings come from the decoding parameters.
decoder: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <emb>
    decoder: !ref <dec>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <beam_size>
    eos_threshold: !ref <eos_threshold>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
    temperature: !ref <temperature>
# Named components exposed by this configuration.
modules:
    normalizer: !ref <normalizer>
    encoder: !ref <encoder>
    decoder: !ref <decoder>
# Pretrainer and the objects whose saved parameters it is responsible for.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        normalizer: !ref <normalizer>
        asr: !ref <asr_model>
        tokenizer: !ref <tokenizer>