# ############################################################################
# Model: Direct SLU
# Encoder: Pre-trained ASR encoder -> LSTM
# Decoder: GRU + beamsearch
# Tokens: BPE with unigram
# losses: NLL
# Training: Timers and Such
# Authors: Loren Lugosch, Mirco Ravanelli 2020
# ############################################################################
#
# HyperPyYAML configuration: `!new:` instantiates the named class with the
# nested keys as keyword arguments, `!ref <key>` reuses the value (or object)
# defined under another top-level key of this file.

# Model parameters
sample_rate: 16000
emb_size: 128
dec_neurons: 512
output_neurons: 51 # index(eos/bos) = 0
ASR_encoder_dim: 512
encoder_dim: 256

# Decoding parameters
# bos and eos deliberately share index 0 (see output_neurons above).
bos_index: 0
eos_index: 0
min_decode_ratio: 0.0
max_decode_ratio: 10.0
slu_beam_size: 80
eos_threshold: 1.5
temperature: 1.25

# Models
# Identifier of the pre-trained ASR model whose encoder feeds slu_enc.
# NOTE(review): this file only names the source; presumably the inference
# interface that consumes this config loads it -- confirm against the caller.
asr_model_source: speechbrain/asr-crdnn-rnnlm-librispeech

# SLU encoder stacked on top of the ASR encoder output.
slu_enc: !new:speechbrain.nnet.containers.Sequential
    input_shape: [null, null, !ref <ASR_encoder_dim>]
    lstm: !new:speechbrain.nnet.RNN.LSTM
        input_size: !ref <ASR_encoder_dim>
        bidirectional: True
        hidden_size: !ref <encoder_dim>
        num_layers: 2
    linear: !new:speechbrain.nnet.linear.Linear
        # The LSTM is bidirectional, so it emits 2 * encoder_dim features.
        input_size: !ref <encoder_dim> * 2
        n_neurons: !ref <encoder_dim>

# Embedding of previously emitted output tokens, fed to the decoder.
output_emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>

# Attentional GRU decoder over the slu_enc output.
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <encoder_dim>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: keyvalue
    hidden_size: !ref <dec_neurons>
    attn_dim: 512
    num_layers: 3
    scaling: 1.0
    dropout: 0.0

# Projection from decoder states to output-token scores.
seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>

# All trainable SLU modules bundled so the pretrainer can load them as one
# object.
# NOTE(review): the list order determines the parameter names inside the
# ModuleList, so it must match the order used when the checkpoint was saved
# -- confirm against the training recipe.
model: !new:torch.nn.ModuleList
    - [!ref <slu_enc>, !ref <output_emb>, !ref <dec>, !ref <seq_lin>]

tokenizer: !new:sentencepiece.SentencePieceProcessor

# Loads the pre-trained parameters into the objects listed under `loadables`.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        tokenizer: !ref <tokenizer>

# Beam search over the decoder, configured by the decoding parameters above.
beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <output_emb>
    decoder: !ref <dec>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <slu_beam_size>
    eos_threshold: !ref <eos_threshold>
    temperature: !ref <temperature>
    # max_attn_shift is only relevant when using_max_attn_shift is enabled.
    using_max_attn_shift: False
    max_attn_shift: 30

# Objects exposed under the `modules` key.
modules:
    slu_enc: !ref <slu_enc>
    beam_searcher: !ref <beam_searcher>