# Hparams NEEDED
HPARAMS_NEEDED: ["wav2vec_output_dim", "emb_size", "dec_neurons", "dec_layers", "output_neurons", "log_softmax", "tokenizer"]
# Modules Needed
MODULES_NEEDED: ["encoder_w2v2", "embedding", "ctc_lin", "seq_lin", "lm_model"]

# Pretrain folder (HuggingFace)
output_folder: !ref output_folder_seq2seq_cv_podcast_arhiv_augmentation
pretrained_path: Macedonian-ASR/wav2vec2-aed-macedonian-asr
# wav2vec2_hub: facebook/wav2vec2-large-xlsr-53
wav2vec2_hub: jonatasgrosman/wav2vec2-large-xlsr-53-russian
save_folder: !ref <output_folder>/save
wav2vec2_folder: !ref <save_folder>/wav2vec2_checkpoint

####################### Model Parameters #######################################
dropout: 0.15
wav2vec_output_dim: 1024
emb_size: 128
dec_neurons: 1024
dec_layers: 1
output_neurons: 1000
blank_index: 0
bos_index: 1
eos_index: 2
unk_index: 0

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
valid_beam_size: 10
test_beam_size: 20
using_eos_threshold: True
eos_threshold: 1.5
using_max_attn_shift: False
max_attn_shift: 700
length_normalization: True
temperature: 1.0
temperature_lm: 1.4

# Scoring parameters
coverage_penalty: 1.5
lm_weight: 0.2

# This is the RNNLM used by the pretrained model from the HuggingFace repository.
# NB: It has to match the pre-trained RNNLM!
lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
    output_neurons: !ref <output_neurons>
    embedding_dim: !ref <emb_size>
    activation: !name:torch.nn.LeakyReLU
    dropout: 0.0
    rnn_layers: 3
    rnn_neurons: 2048
    dnn_blocks: 2
    dnn_neurons: 1024
    return_hidden: True  # For inference

# Wav2vec2 encoder
encoder_w2v2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
    source: !ref <wav2vec2_hub>
    output_norm: True
    freeze: False
    freeze_feature_extractor: True
    save_path: !ref <wav2vec2_folder>
    output_all_hiddens: False

embedding: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>
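# NOTE: `!ref <key>` copies a value defined elsewhere in this file, and
# `!new:module.Class` instantiates that class with the indented keys as
# constructor arguments. A minimal loading sketch, assuming the standard
# HyperPyYAML flow (the filename below is a placeholder):
#
#   from hyperpyyaml import load_hyperpyyaml
#   with open("hyperparams.yaml") as fin:
#       hparams = load_hyperpyyaml(fin)  # builds every `!new:` object on load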
# Attention-based RNN decoder
decoder: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <wav2vec_output_dim>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: location
    hidden_size: !ref <dec_neurons>
    attn_dim: 512
    num_layers: !ref <dec_layers>
    scaling: 1.0
    channels: 10
    kernel_size: 100
    re_init: True
    dropout: !ref <dropout>

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <wav2vec_output_dim>
    n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

tokenizer: !new:sentencepiece.SentencePieceProcessor
    model_file: 1000_unigram.model

modules:
    encoder_w2v2: !ref <encoder_w2v2>
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    ctc_lin: !ref <ctc_lin>
    seq_lin: !ref <seq_lin>
    lm_model: !ref <lm_model>

model: !new:torch.nn.ModuleList
    - [!ref <encoder_w2v2>, !ref <embedding>, !ref <decoder>, !ref <ctc_lin>, !ref <seq_lin>]

############################## Decoding & optimiser ############################

coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
    vocab_size: !ref <output_neurons>

rnnlm_scorer: !new:speechbrain.decoders.scorer.RNNLMScorer
    language_model: !ref <lm_model>
    temperature: !ref <temperature_lm>

# Coverage-only scorer (no LM fusion)
scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [!ref <coverage_scorer>]
    weights:
        coverage: !ref <coverage_penalty>

# Scorer with RNNLM shallow fusion
scorer_lm: !new:speechbrain.decoders.scorer.ScorerBuilder
    full_scorers: [!ref <rnnlm_scorer>, !ref <coverage_scorer>]
    weights:
        rnnlm: !ref <lm_weight>
        coverage: !ref <coverage_penalty>

test_search: !new:speechbrain.decoders.S2SRNNBeamSearcher
    embedding: !ref <embedding>
    decoder: !ref <decoder>
    linear: !ref <seq_lin>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <test_beam_size>
    eos_threshold: !ref <eos_threshold>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
    temperature: !ref <temperature>
    scorer: !ref <scorer_lm>  # assumed: the LM-fused scorer, since the pretrainer loads lm.ckpt; <scorer> above is the coverage-only alternative

############################## Logging and Pretrainer ##########################

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        model: !ref <model>
        lm: !ref <lm_model>
    paths:
        model: !ref <pretrained_path>/model.ckpt
        lm: !ref <pretrained_path>/lm.ckpt
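# A minimal inference sketch (an assumption about how this file is consumed,
# not part of the config; `hparams` is the dict returned by load_hyperpyyaml
# as in the note above):
#
#   hparams["pretrainer"].collect_files()   # fetch model.ckpt / lm.ckpt from the HF repo
#   hparams["pretrainer"].load_collected()  # load the weights into the modules
#
# HuggingFace repos with this layout are typically wrapped by an inference
# interface (e.g. via speechbrain.inference.interfaces.foreign_class) rather
# than driven by hand.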