# NOTE(review): this section was rebuilt from a copy whose line breaks were
# collapsed and whose `!ref` tags had lost their `<target>` part. Targets were
# restored from the key names defined in this file; the less obvious ones carry
# their own NOTE(review) comment. Confirm against the training recipe.

# Streaming flag. Nothing else in this section references it.
# NOTE(review): presumably read by the inference interface -- confirm.
streaming: True

# Decoding parameters
# Be sure that the bos and eos index match with the BPEs ones
blank_index: 0
bos_index: 0
eos_index: 0
pad_index: 0
beam_size: 20
nbest: 1
# by default {state,expand}_beam = 2.3 as mentioned in the paper
# https://arxiv.org/abs/1904.02619
state_beam: 2.5
expand_beam: 2.5
lm_weight: 0.1

# Feature parameters (all four are forwarded to `compute_features`)
sample_rate: 16000
n_fft: 512
n_mels: 80
win_length: 32

# Transformer
d_model: 256
joint_dim: 640
nhead: 4
num_encoder_layers: 12
num_decoder_layers: 0
d_ffn: 2048
activation: !name:torch.nn.GELU
output_neurons: 5000
dec_dim: 512

# Input normalization. The key is `normalizer`, while the consumers below
# (`modules`, `encoder`) register it under the name `normalize`.
normalizer: !new:speechbrain.processing.features.InputNormalization
    norm_type: global

# Filterbank feature extraction
compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>
    win_length: !ref <win_length>

# Convolutional front-end: 2 blocks of 1 layer each, stride 2 per block
CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
    input_shape: (8, 10, 80)
    num_blocks: 2
    num_layers_per_block: 1
    out_channels: (64, 32)
    kernel_sizes: (3, 3)
    strides: (2, 2)
    residuals: (False, False)

# Conformer encoder; `num_decoder_layers` is 0, so there is no decoder here
transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
    input_size: 640
    tgt_vocab: !ref <output_neurons>
    d_model: !ref <d_model>
    nhead: !ref <nhead>
    num_encoder_layers: !ref <num_encoder_layers>
    num_decoder_layers: !ref <num_decoder_layers>
    d_ffn: !ref <d_ffn>
    activation: !ref <activation>
    encoder_module: conformer
    attention_type: RelPosMHAXL
    normalize_before: True
    causal: False

# We must call an encoder wrapper so the decoder isn't run (we don't have any)
enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
    transformer: !ref <transformer>

# For MTL CTC over the encoder
# NOTE(review): input_size restored as <joint_dim> (the output width of
# `proj_enc`) -- confirm it is not <d_model>.
proj_ctc: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <joint_dim>
    n_neurons: !ref <output_neurons>

# Define some projection layers to make sure that enc and dec
# output dim are the same before joining
proj_enc: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <d_model>
    n_neurons: !ref <joint_dim>
    bias: False

proj_dec: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_dim>
    n_neurons: !ref <joint_dim>
    bias: False

# Prediction-network embedding
emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    consider_as_one_hot: True
    blank_id: !ref <blank_index>

# Prediction-network LSTM
# NOTE(review): the `- 1` presumably accounts for the blank entry dropped by
# the one-hot embedding above -- confirm.
dec: !new:speechbrain.nnet.RNN.LSTM
    input_shape: [null, null, !ref <output_neurons> - 1]
    hidden_size: !ref <dec_dim>
    num_layers: 1
    re_init: True

Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
    joint: sum  # joint [sum | concat]
    nonlinearity: !ref <activation>

transducer_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <joint_dim>
    n_neurons: !ref <output_neurons>
    bias: False

# Container loaded as `asr` by the pretrainer below.
# NOTE(review): the eight members and their order were restored by inference;
# the order must match the checkpoint that gets loaded -- confirm.
asr_model: !new:torch.nn.ModuleList
    - [!ref <CNN>, !ref <enc>, !ref <emb>, !ref <dec>, !ref <proj_enc>, !ref <proj_dec>, !ref <proj_ctc>, !ref <transducer_lin>]

# Beam search with LM; `lm_model` is defined further down in this file
Beamsearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
    decode_network_lst: !new:torch.nn.ModuleList
        - [!ref <emb>, !ref <dec>, !ref <proj_dec>]
    tjoint: !ref <Tjoint>
    classifier_network: !new:torch.nn.ModuleList
        - [!ref <transducer_lin>]
    blank_id: !ref <blank_index>
    beam_size: !ref <beam_size>
    nbest: !ref <nbest>
    lm_module: !ref <lm_model>
    lm_weight: !ref <lm_weight>
    state_beam: !ref <state_beam>
    expand_beam: !ref <expand_beam>

# Same searcher class with beam_size 1 / nbest 1 and no LM; this is the
# instance bound to `decoding_function` below
Greedysearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
    decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
    tjoint: !ref <Tjoint>
    classifier_network: [!ref <transducer_lin>]
    blank_id: !ref <blank_index>
    beam_size: 1
    nbest: 1

# RNN language model used by `Beamsearcher` and loaded as `lm` by the pretrainer
lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
    output_neurons: !ref <output_neurons>
    embedding_dim: 256
    activation: !name:torch.nn.LeakyReLU
    dropout: 0.0
    rnn_layers: 6
    rnn_neurons: 512
    dnn_blocks: 1
    dnn_neurons: 256
    return_hidden: True

# For inference
modules:
    CNN: !ref <CNN>
    enc: !ref <enc>
    emb: !ref <emb>
    dec: !ref <dec>
    Tjoint: !ref <Tjoint>
    transducer_lin: !ref <transducer_lin>
    normalize: !ref <normalizer>
    proj_ctc: !ref <proj_ctc>
    proj_dec: !ref <proj_dec>
    proj_enc: !ref <proj_enc>

tokenizer: !new:sentencepiece.SentencePieceProcessor

# We compose the inference (encoder) pipeline.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
    input_shape: [null, null]
    compute_features: !ref <compute_features>
    normalize: !ref <normalizer>
    cnn: !ref <CNN>
    transformer_encoder: !ref <enc>
    proj_enc: !ref <proj_enc>

# Loads pretrained parameters into the three objects listed under `loadables`
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        asr: !ref <asr_model>
        lm: !ref <lm_model>
        tokenizer: !ref <tokenizer>

# inference stuff
make_tokenizer_streaming_context: !name:speechbrain.tokenizers.SentencePiece.SentencePieceDecoderStreamingContext
tokenizer_decode_streaming: !name:speechbrain.tokenizers.SentencePiece.spm_decode_preserve_leading_space

make_decoder_streaming_context: !name:speechbrain.decoders.transducer.TransducerGreedySearcherStreamingContext  # default constructor
decoding_function: !name:speechbrain.decoders.transducer.TransducerBeamSearcher.transducer_greedy_decode_streaming
    - !ref <Greedysearcher>  # self

# NOTE(review): the three wrapped modules and the two entries of `properties`
# were restored by inference (features -> normalization -> CNN, with
# normalization left out of `properties` as the comment below says) -- confirm.
fea_streaming_extractor: !new:speechbrain.lobes.features.StreamingFeatureWrapper
    module: !new:speechbrain.nnet.containers.LengthsCapableSequential
        - !ref <compute_features>
        - !ref <normalizer>
        - !ref <CNN>
    # don't consider normalization as part of the input filter chain.
    # normalization will operate at chunk level, which mismatches training
    # somewhat, but does not appear to result in noticeable degradation.
    properties: !apply:speechbrain.utils.filter_analysis.stack_filter_properties
        - [!ref <compute_features>, !ref <CNN>]