File size: 2,465 Bytes
418d5a5
5025284
418d5a5
04a099d
 
 
418d5a5
 
 
04a099d
418d5a5
 
 
5025284
418d5a5
5025284
 
 
 
 
04a099d
418d5a5
 
04a099d
418d5a5
 
 
 
 
 
74357e5
 
5025284
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418d5a5
 
5025284
 
 
 
418d5a5
 
5025284
 
418d5a5
 
5025284
418d5a5
 
5025284
418d5a5
 
5025284
418d5a5
 
 
 
 
 
5025284
418d5a5
 
5025284
 
 
 
418d5a5
 
5025284
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# ################################
# Model: wav2vec2 + DNN + CTC
# Augmentation: SpecAugment
# Authors: 
# Sung-Lin Yeh 2021
# Pooneh Mousavi 2023
# ################################

# BPE parameters
# Tokenizer model type; the accepted values are listed in the trailing comment.
token_type: unigram  # ["unigram", "bpe", "char"]
character_coverage: 1.0

# Model parameters
# activation: !name:torch.nn.LeakyReLU
# Width of every Linear layer in `enc`, and the input size of `ctc_lin`.
dnn_neurons: 1024
# Size of the last dimension of the input shape declared for `enc`.
wav2vec_output_dim: 1024
# Probability shared by the Dropout layers in `enc`.
dropout: 0.15

# NOTE(review): not referenced by any other entry visible in this file;
# presumably read by the code that loads these hyperparameters -- confirm.
sample_rate: 16000

# Identifier passed to the wav2vec2 wrapper as its `source`.
wav2vec2_hub: facebook/wav2vec2-large-lv60

# Outputs
# Number of output units of `ctc_lin`.
output_neurons: 1000  # BPE size; the CTC blank uses index 0 (see blank_index)

# Decoding parameters
# Be sure that the bos and eos indices match the ones used by the BPE tokenizer.
# Only blank_index is referenced elsewhere in this file (ctc_cost and
# decoding_function).
blank_index: 0
bos_index: 1
eos_index: 2

# DNN that sits between wav2vec2 and the CTC output layer (see `encoder`).
# Three Linear -> BatchNorm1d -> LeakyReLU stages; a Dropout layer follows the
# first two stages only. The key order below is the layer order, so do not
# reorder entries.
# `!name:` entries are handed over un-instantiated, while `!new:` entries are
# instantiated when this file is loaded.
# NOTE(review): presumably the container builds the `!name:` layers itself
# from the running input shape -- confirm against the Sequential container.
enc: !new:speechbrain.nnet.containers.Sequential
  # Only the last (feature) dimension is fixed; the first two are left open.
  input_shape: [null, null, !ref <wav2vec_output_dim>]
  # Stage 1
  linear1: !name:speechbrain.nnet.linear.Linear
    n_neurons: !ref <dnn_neurons>
    bias: True
  bn1: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation: !new:torch.nn.LeakyReLU
  drop: !new:torch.nn.Dropout
    p: !ref <dropout>
  # Stage 2
  linear2: !name:speechbrain.nnet.linear.Linear
    n_neurons: !ref <dnn_neurons>
    bias: True
  bn2: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation2: !new:torch.nn.LeakyReLU
  drop2: !new:torch.nn.Dropout
    p: !ref <dropout>
  # Stage 3 (no Dropout after this stage)
  linear3: !name:speechbrain.nnet.linear.Linear
    n_neurons: !ref <dnn_neurons>
    bias: True
  bn3: !name:speechbrain.nnet.normalization.BatchNorm1d
  activation3: !new:torch.nn.LeakyReLU

# wav2vec2 wrapper built from the model named by <wav2vec2_hub>. It is the
# first stage of `encoder` and is also listed as a pretrainer loadable.
wav2vec2: !new:speechbrain.lobes.models.huggingface_wav2vec.HuggingFaceWav2Vec2
  source: !ref <wav2vec2_hub>
  output_norm: True
  # NOTE(review): presumably keeps the wav2vec2 weights fixed -- confirm
  # against the wrapper's documentation.
  freeze: True
  # Relative path. NOTE(review): presumably where the downloaded model is
  # stored, resolved against the working directory -- confirm.
  save_path: wav2vec2_checkpoint

# Output projection: maps the <dnn_neurons>-wide DNN output to
# <output_neurons> units. It is the last stage of `encoder`.
ctc_lin: !new:speechbrain.nnet.linear.Linear
  input_size: !ref <dnn_neurons>
  n_neurons: !ref <output_neurons>

# Softmax with apply_log enabled.
# NOTE(review): no other entry visible in this file references it (it is not
# part of `encoder`); presumably used by the loading code, if at all -- confirm.
log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: True

# CTC loss function with the blank index pre-bound. `!name:` yields a callable
# rather than a call result.
# NOTE(review): not referenced by any other entry visible in this file.
ctc_cost: !name:speechbrain.nnet.losses.ctc_loss
  blank_index: !ref <blank_index>

# Groups the DNN and the CTC output layer into one module list; the pretrainer
# below handles it as the single loadable named `asr`.
asr_model: !new:torch.nn.ModuleList
  - [!ref <enc>, !ref <ctc_lin>]

# SentencePiece processor created without arguments; listed as a pretrainer
# loadable below.
tokenizer: !new:sentencepiece.SentencePieceProcessor

# Complete acoustic chain, in order: wav2vec2 -> enc -> ctc_lin.
encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
  wav2vec2: !ref <wav2vec2>
  enc: !ref <enc>
  ctc_lin: !ref <ctc_lin>

# Modules exposed by name to the code that loads this file.
modules:
  encoder: !ref <encoder>

# Greedy CTC decoding with the blank id pre-bound.
decoding_function: !name:speechbrain.decoders.ctc_greedy_decode
  blank_id: !ref <blank_index>

# Maps each loadable name to the object defined above that it targets.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  loadables:
    wav2vec2: !ref <wav2vec2>
    asr: !ref <asr_model>
    tokenizer: !ref <tokenizer>