ddwkim committed on
Commit
fd56653
1 Parent(s): ea0ac2e

Add asr.ckpt

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. asr.ckpt +3 -0
  3. hyperparams.yaml +45 -24
  4. normalizer.ckpt +3 -0
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  lm.ckpt filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  lm.ckpt filter=lfs diff=lfs merge=lfs -text
37
+ asr.ckpt filter=lfs diff=lfs merge=lfs -text
asr.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59a397ccda6a54e04e47695313779f987fd0bc1487f5764d7b190915d9d94c44
3
+ size 302867350
hyperparams.yaml CHANGED
@@ -1,13 +1,19 @@
 
 
1
  # Decoding parameters
2
  # Be sure that the bos and eos index match with the BPEs ones
 
3
  blank_index: 0
4
  bos_index: 0
5
  eos_index: 0
6
- lm_weight: 0.1
7
- beam_size: 4
8
  nbest: 1
9
- state_beam: 2.3
10
- expand_beam: 2.3
 
 
 
11
 
12
  sample_rate: 16000
13
  n_fft: 512
@@ -25,7 +31,7 @@ activation: !name:torch.nn.GELU
25
  output_neurons: 5000
26
  dec_dim: 512
27
 
28
- normalize: !new:speechbrain.processing.features.InputNormalization
29
  norm_type: global
30
 
31
  compute_features: !new:speechbrain.lobes.features.Fbank
@@ -43,7 +49,7 @@ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
43
  strides: (2, 2)
44
  residuals: (False, False)
45
 
46
- Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
47
  input_size: 640
48
  tgt_vocab: !ref <output_neurons>
49
  d_model: !ref <d_model>
@@ -59,7 +65,19 @@ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.Transforme
59
 
60
  # We must call an encoder wrapper so the decoder isn't run (we don't have one)
61
  enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
62
- transformer: !ref <Transformer>
 
 
 
 
 
 
 
 
 
 
 
 
63
 
64
  proj_dec: !new:speechbrain.nnet.linear.Linear
65
  input_size: !ref <dec_dim>
@@ -87,19 +105,15 @@ transducer_lin: !new:speechbrain.nnet.linear.Linear
87
  bias: False
88
 
89
  asr_model: !new:torch.nn.ModuleList
90
- - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <transducer_lin>]
91
 
92
- Beamsearcher: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
93
- decode_network_lst: [!ref <emb>, !ref <dec>, !ref <proj_dec>]
 
94
  tjoint: !ref <Tjoint>
95
- classifier_network: [!ref <transducer_lin>]
96
- blank_id: !ref <blank_index>
97
- beam_size: !ref <beam_size>
98
- nbest: !ref <nbest>
99
- lm_module: !ref <lm_model>
100
- lm_weight: !ref <lm_weight>
101
- state_beam: !ref <state_beam>
102
- expand_beam: !ref <expand_beam>
103
 
104
  lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
105
  output_neurons: !ref <output_neurons>
@@ -116,18 +130,25 @@ tokenizer: !new:sentencepiece.SentencePieceProcessor
116
 
117
  # We compose the inference (encoder) pipeline.
118
  encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
119
- input_shape: [null, null, !ref <n_mels>]
120
  compute_features: !ref <compute_features>
121
  normalize: !ref <normalizer>
122
- model: !ref <enc>
 
 
123
 
124
  modules:
125
- normalizer: !ref <normalizer>
126
- encoder: !ref <encoder>
127
- decoder: !ref <decoder>
 
 
 
 
 
128
 
129
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
130
  loadables:
131
- normalizer: !ref <normalizer>
132
  asr: !ref <asr_model>
 
133
  tokenizer: !ref <tokenizer>
 
1
+ transducer_beam_search: true
2
+
3
  # Decoding parameters
4
  # Be sure that the bos and eos index match with the BPEs ones
5
+ # Decoding parameters
6
  blank_index: 0
7
  bos_index: 0
8
  eos_index: 0
9
+ pad_index: 0
10
+ beam_size: 20
11
  nbest: 1
12
+ # by default {state,expand}_beam = 2.3, as mentioned in the paper
13
+ # https://arxiv.org/abs/1904.02619
14
+ state_beam: 2.0
15
+ expand_beam: 2.0
16
+ lm_weight: 0.1
17
 
18
  sample_rate: 16000
19
  n_fft: 512
 
31
  output_neurons: 5000
32
  dec_dim: 512
33
 
34
+ normalizer: !new:speechbrain.processing.features.InputNormalization
35
  norm_type: global
36
 
37
  compute_features: !new:speechbrain.lobes.features.Fbank
 
49
  strides: (2, 2)
50
  residuals: (False, False)
51
 
52
+ transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
53
  input_size: 640
54
  tgt_vocab: !ref <output_neurons>
55
  d_model: !ref <d_model>
 
65
 
66
  # We must call an encoder wrapper so the decoder isn't run (we don't have one)
67
  enc: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
68
+ transformer: !ref <transformer>
69
+
70
+ # For MTL CTC over the encoder
71
+ proj_ctc: !new:speechbrain.nnet.linear.Linear
72
+ input_size: !ref <joint_dim>
73
+ n_neurons: !ref <output_neurons>
74
+
75
+ # Define some projection layers to make sure that enc and dec
76
+ # output dim are the same before joining
77
+ proj_enc: !new:speechbrain.nnet.linear.Linear
78
+ input_size: !ref <d_model>
79
+ n_neurons: !ref <joint_dim>
80
+ bias: False
81
 
82
  proj_dec: !new:speechbrain.nnet.linear.Linear
83
  input_size: !ref <dec_dim>
 
105
  bias: False
106
 
107
  asr_model: !new:torch.nn.ModuleList
108
+ - [!ref <CNN>, !ref <enc>, !ref <emb>, !ref <dec>, !ref <proj_enc>, !ref <proj_dec>, !ref <proj_ctc>, !ref <transducer_lin>]
109
 
110
+ decoder: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
111
+ decode_network_lst: !new:torch.nn.ModuleList
112
+ - [!ref <emb>, !ref <dec>, !ref <proj_dec>]
113
  tjoint: !ref <Tjoint>
114
+ classifier_network: !new:torch.nn.ModuleList
115
+ - [!ref <transducer_lin>]
116
+ blank_id: !ref <blank_index>
 
 
 
 
 
117
 
118
  lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
119
  output_neurons: !ref <output_neurons>
 
130
 
131
  # We compose the inference (encoder) pipeline.
132
  encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
133
+ input_shape: [null, null]
134
  compute_features: !ref <compute_features>
135
  normalize: !ref <normalizer>
136
+ cnn: !ref <CNN>
137
+ transformer_encoder: !ref <enc>
138
+ proj_enc: !ref <proj_enc>
139
 
140
  modules:
141
+ compute_features: !ref <compute_features>
142
+ normalizer: !ref <normalizer>
143
+ pre_transformer: !ref <CNN>
144
+ transformer: !ref <transformer>
145
+ asr_model: !ref <asr_model>
146
+ lm_model: !ref <lm_model>
147
+ encoder: !ref <encoder>
148
+ decoder: !ref <decoder>
149
 
150
  pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
151
  loadables:
 
152
  asr: !ref <asr_model>
153
+ lm: !ref <lm_model>
154
  tokenizer: !ref <tokenizer>
normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61e5e95123865b4283aad2a880af44f8f574f3f170019e26b9ed2be45b00bfd7
3
+ size 2218