# ############################################################################
# Model: Direct SLU
# Encoder: Pre-trained ASR encoder -> LSTM
# Decoder: GRU + beamsearch
# Tokens: BPE with unigram
# losses: NLL
# Training: Timers and Such
# Authors: Loren Lugosch, Mirco Ravanelli 2020
# ############################################################################

# NOTE(review): the keys below are referenced in this file but are not defined
# in the part of it that is visible here. The reference names are assumptions
# based on the parameters they feed; confirm each one is defined elsewhere in
# the file or supplied as an override before loading:
#   data_folder_rirs, save_folder, tokenizer_file, lr, epoch_counter, train_log

token_type: unigram  # ["unigram", "bpe", "char"]

# Model parameters
sample_rate: 16000
emb_size: 128
dec_neurons: 512
output_neurons: 51  # index(eos/bos) = 0
ASR_encoder_dim: 512
encoder_dim: 256

# Decoding parameters
# bos and eos deliberately share index 0 (see output_neurons above).
bos_index: 0
eos_index: 0
min_decode_ratio: 0.0
max_decode_ratio: 10.0
slu_beam_size: 80
eos_threshold: 1.5
temperature: 1.25

# Models
# Pre-trained ASR system, instantiated at load time via `!apply`.
# The device is hard-coded to the first CUDA GPU.
asr_model: !apply:speechbrain.pretrained.EncoderDecoderASR.from_hparams
  source: speechbrain/asr-crdnn-rnnlm-librispeech
  run_opts: {"device":"cuda:0"}

# SLU encoder stacked on the ASR encoder output: a 2-layer bidirectional LSTM
# followed by a linear layer with encoder_dim output neurons.
slu_enc: !new:speechbrain.nnet.containers.Sequential
  input_shape: [null, null, !ref <ASR_encoder_dim>]
  lstm: !new:speechbrain.nnet.RNN.LSTM
    input_size: !ref <ASR_encoder_dim>
    bidirectional: True
    hidden_size: !ref <encoder_dim>
    num_layers: 2
  linear: !new:speechbrain.nnet.linear.Linear
    # "* 2" matches bidirectional: True on the LSTM above.
    input_size: !ref <encoder_dim> * 2
    n_neurons: !ref <encoder_dim>

# Embedding table for the output tokens fed to the decoder.
output_emb: !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: !ref <output_neurons>
  embedding_dim: !ref <emb_size>

# Attentional GRU decoder over the SLU encoder states.
dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  enc_dim: !ref <encoder_dim>
  input_size: !ref <emb_size>
  rnn_type: gru
  attn_type: keyvalue
  hidden_size: !ref <dec_neurons>
  attn_dim: 512
  num_layers: 3
  scaling: 1.0
  dropout: 0.0

# Output projection from decoder states to token scores.
seq_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <dec_neurons>
  n_neurons: !ref <output_neurons>

# Environmental corruption: only additive noise is enabled
# (noise_prob 1.0; babble and reverb probabilities are 0.0).
env_corrupt: !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: !ref <data_folder_rirs>  # NOTE(review): assumed key name
  babble_prob: 0.0
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15

modules:
  slu_enc: !ref <slu_enc>
  output_emb: !ref <output_emb>
  dec: !ref <dec>
  seq_lin: !ref <seq_lin>
  env_corrupt: !ref <env_corrupt>

# The four modules grouped into one ModuleList; this is the object the
# checkpointer below saves under "model".
model: !new:torch.nn.ModuleList
  - [!ref <slu_enc>, !ref <output_emb>, !ref <dec>, !ref <seq_lin>]

tokenizer: !new:sentencepiece.SentencePieceProcessor

# Loads the tokenizer from an external file into the object above.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  collect_in: !ref <save_folder>/TAS_tokenizer  # NOTE(review): assumed key name
  loadables:
    tokenizer: !ref <tokenizer>
  paths:
    tokenizer: !ref <tokenizer_file>  # NOTE(review): assumed key name

beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: !ref <output_emb>
  decoder: !ref <dec>
  linear: !ref <seq_lin>
  bos_index: !ref <bos_index>
  eos_index: !ref <eos_index>
  min_decode_ratio: !ref <min_decode_ratio>
  max_decode_ratio: !ref <max_decode_ratio>
  beam_size: !ref <slu_beam_size>
  eos_threshold: !ref <eos_threshold>
  temperature: !ref <temperature>
  using_max_attn_shift: False
  max_attn_shift: 30
  coverage_penalty: 0.

# `!name` yields a callable with `lr` pre-bound rather than an instance.
opt_class: !name:torch.optim.Adam
  lr: !ref <lr>  # NOTE(review): assumed key name

lr_annealing: !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: !ref <lr>  # NOTE(review): assumed key name
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 0

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: !ref <save_folder>  # NOTE(review): assumed key name
  recoverables:
    model: !ref <model>
    scheduler: !ref <lr_annealing>
    counter: !ref <epoch_counter>  # NOTE(review): assumed key name

augmentation: !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: !ref <sample_rate>
  speeds: [95, 100, 105]

log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: True

seq_cost: !name:speechbrain.nnet.losses.nll_loss
  label_smoothing: 0.1

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: !ref <train_log>  # NOTE(review): assumed key name

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

# Same statistics class as above, with split_tokens enabled.
cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: True