speechbrain
/

asr-wav2vec2-commonvoice-rw

@@ -1,118 +0,0 @@
-# ################################
-# Model: wav2vec2 + DNN + CTC/Attention
-# Augmentation: SpecAugment
-# Authors: Titouan Parcollet 2021
-# ################################
-# Scalar hyperparameters. Most of them are consumed further down in this
-# file through `!ref <name>`.
-sample_rate: 16000
-# Passed as `source` to the wav2vec2 entry below.
-wav2vec2_hub: facebook/wav2vec2-large-xlsr-53
-# BPE parameters
-token_type: unigram  # ["unigram", "bpe", "char"]
-character_coverage: 1.0
-# Model parameters
-activation: !name:torch.nn.LeakyReLU
-dnn_layers: 2
-dnn_neurons: 1024
-emb_size: 128
-dec_neurons: 1024
-# Outputs
-output_neurons: 1000  # BPE size; the blank/bos/eos indices are set below
-# Decoding parameters
-# Make sure the bos and eos indices match the ones used by the BPE tokenizer
-blank_index: 0
-bos_index: 1
-eos_index: 2
-# The values below (except ctc_weight_decode) are forwarded to the
-# `decoder` beam searcher entry.
-min_decode_ratio: 0.0
-max_decode_ratio: 1.0
-beam_size: 10
-eos_threshold: 1.5
-using_max_attn_shift: True
-max_attn_shift: 140
-# NOTE(review): ctc_weight_decode is not referenced by any entry in the
-# visible part of this file -- confirm whether the decoder should receive it.
-ctc_weight_decode: 0.0
-temperature: 1.50
-# DNN stacked after wav2vec2 inside the `encoder` entry; its depth, width
-# and activation come from the model parameters above.
-enc: !new:speechbrain.lobes.models.VanillaNN.VanillaNN
-    # Presumably (batch, time, features) with 1024 wav2vec2 features -- TODO confirm.
-    input_shape: [null, null, 1024]
-    activation: !ref <activation>
-    dnn_blocks: !ref <dnn_layers>
-    dnn_neurons: !ref <dnn_neurons>
-# wav2vec2 front-end; the model identifier is taken from <wav2vec2_hub>.
-wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
-    source: !ref <wav2vec2_hub>
-    output_norm: True
-    freeze: True
-    save_path: model_checkpoints
-# Token embedding table: <output_neurons> entries of size <emb_size>.
-# Handed to the `decoder` entry as its `embedding`.
-emb: !new:speechbrain.nnet.embedding.Embedding
-    num_embeddings: !ref <output_neurons>
-    embedding_dim: !ref <emb_size>
-# Attentional GRU decoder. It reads encoder states of size <dnn_neurons>
-# and token embeddings of size <emb_size>.
-dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
-    enc_dim: !ref <dnn_neurons>
-    input_size: !ref <emb_size>
-    rnn_type: gru
-    attn_type: location
-    # Tied to <dec_neurons> instead of a literal 1024: `seq_lin` sizes its
-    # input from <dec_neurons>, so both must change together.
-    hidden_size: !ref <dec_neurons>
-    attn_dim: 1024
-    num_layers: 1
-    scaling: 1.0
-    channels: 10
-    kernel_size: 100
-    re_init: True
-    dropout: 0.0
-# Linear projection from the encoder DNN width (<dnn_neurons>) to the
-# <output_neurons> token classes; passed to the decoder as `ctc_linear`.
-ctc_lin: !new:speechbrain.nnet.linear.Linear
-    input_size: !ref <dnn_neurons>
-    n_neurons: !ref <output_neurons>
-# Linear projection from <dec_neurons> to the <output_neurons> token
-# classes; passed to the decoder as `linear`.
-seq_lin: !new:speechbrain.nnet.linear.Linear
-    input_size: !ref <dec_neurons>
-    n_neurons: !ref <output_neurons>
-log_softmax: !new:speechbrain.nnet.activations.Softmax
-    apply_log: True
-# NOTE(review): log_softmax, ctc_cost and seq_cost are not referenced by any
-# other entry in the visible part of this file -- presumably kept from the
-# training recipe; confirm before removing.
-ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
-    blank_index: !ref <blank_index>
-seq_cost: !name:speechbrain.nnet.losses.nll_loss
-    label_smoothing: 0.1
-# Groups enc, emb, dec, ctc_lin and seq_lin in one ModuleList, which the
-# pretrainer registers under the name `asr`.
-asr_model: !new:torch.nn.ModuleList
-    - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
-# Created without arguments here; registered as the `tokenizer` loadable
-# of the pretrainer.
-tokenizer: !new:sentencepiece.SentencePieceProcessor
-# Full encoder: wav2vec2 followed by the `enc` DNN.
-encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
-    wav2vec2: !ref <wav2vec2>
-    enc: !ref <enc>
-# Beam searcher wired to the embedding, attentional decoder and the two
-# output projections, with the decoding parameters defined at the top of
-# the file.
-decoder: !new:speechbrain.decoders.S2SRNNBeamSearcher
-    embedding: !ref <emb>
-    decoder: !ref <dec>
-    linear: !ref <seq_lin>
-    ctc_linear: !ref <ctc_lin>
-    bos_index: !ref <bos_index>
-    eos_index: !ref <eos_index>
-    blank_index: !ref <blank_index>
-    min_decode_ratio: !ref <min_decode_ratio>
-    max_decode_ratio: !ref <max_decode_ratio>
-    beam_size: !ref <beam_size>
-    eos_threshold: !ref <eos_threshold>
-    using_max_attn_shift: !ref <using_max_attn_shift>
-    max_attn_shift: !ref <max_attn_shift>
-    temperature: !ref <temperature>
-# Named entry points exposed by this file: the full encoder and the beam
-# searcher.
-modules:
-    encoder: !ref <encoder>
-    decoder: !ref <decoder>
-# Registers three loadables with the Pretrainer: the wav2vec2 front-end,
-# the `asr_model` ModuleList (under the name `asr`) and the tokenizer.
-pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
-    loadables:
-        wav2vec2: !ref <wav2vec2>
-        asr: !ref <asr_model>
-        tokenizer: !ref <tokenizer>