Aku Rouhe committed on
Commit
1a037da
1 Parent(s): 7870a56

Flat format for pretrained system

Browse files
Files changed (5) hide show
  1. hyperparams.yaml +154 -0
  2. lm.ckpt +3 -0
  3. model.ckpt +3 -0
  4. normalizer.ckpt +3 -0
  5. tokenizer.ckpt +3 -0
hyperparams.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ############################################################################
2
+ # Model: End-to-end (E2E) attention-based ASR
3
+ # Encoder: CRDNN model
4
+ # Decoder: GRU + beamsearch + RNNLM
5
+ # Tokens: BPE with unigram
6
+ # Authors: Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, Peter Plantinga 2020
7
+ # ############################################################################
8
+
9
+
10
+ # Feature parameters
11
+ sample_rate: 16000
12
+ n_fft: 400
13
+ n_mels: 40
14
+
15
+ # Model parameters
16
+ activation: !name:torch.nn.LeakyReLU
17
+ dropout: 0.15
18
+ cnn_blocks: 2
19
+ cnn_channels: (128, 256)
20
+ inter_layer_pooling_size: (2, 2)
21
+ cnn_kernelsize: (3, 3)
22
+ time_pooling_size: 4
23
+ rnn_class: !name:speechbrain.nnet.RNN.LSTM
24
+ rnn_layers: 4
25
+ rnn_neurons: 1024
26
+ rnn_bidirectional: True
27
+ dnn_blocks: 2
28
+ dnn_neurons: 512
29
+ emb_size: 128
30
+ dec_neurons: 1024
31
+ output_neurons: 1000 # index(blank/eos/bos) = 0
32
+ blank_index: 0
33
+
34
+ # Decoding parameters
35
+ bos_index: 0
36
+ eos_index: 0
37
+ min_decode_ratio: 0.0
38
+ max_decode_ratio: 1.0
39
+ beam_size: 80
40
+ eos_threshold: 1.5
41
+ using_max_attn_shift: True
42
+ max_attn_shift: 240
43
+ lm_weight: 0.50
44
+ coverage_penalty: 1.5
45
+ temperature: 1.25
46
+ temperature_lm: 1.25
47
+
48
+ normalize: !new:speechbrain.processing.features.InputNormalization
49
+ norm_type: global
50
+
51
+ compute_features: !new:speechbrain.lobes.features.Fbank
52
+ sample_rate: !ref <sample_rate>
53
+ n_fft: !ref <n_fft>
54
+ n_mels: !ref <n_mels>
55
+
56
+ enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
57
+ input_shape: [null, null, !ref <n_mels>]
58
+ activation: !ref <activation>
59
+ dropout: !ref <dropout>
60
+ cnn_blocks: !ref <cnn_blocks>
61
+ cnn_channels: !ref <cnn_channels>
62
+ cnn_kernelsize: !ref <cnn_kernelsize>
63
+ inter_layer_pooling_size: !ref <inter_layer_pooling_size>
64
+ time_pooling: True
65
+ using_2d_pooling: False
66
+ time_pooling_size: !ref <time_pooling_size>
67
+ rnn_class: !ref <rnn_class>
68
+ rnn_layers: !ref <rnn_layers>
69
+ rnn_neurons: !ref <rnn_neurons>
70
+ rnn_bidirectional: !ref <rnn_bidirectional>
71
+ rnn_re_init: True
72
+ dnn_blocks: !ref <dnn_blocks>
73
+ dnn_neurons: !ref <dnn_neurons>
74
+
75
+ emb: !new:speechbrain.nnet.embedding.Embedding
76
+ num_embeddings: !ref <output_neurons>
77
+ embedding_dim: !ref <emb_size>
78
+
79
+ dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
80
+ enc_dim: !ref <dnn_neurons>
81
+ input_size: !ref <emb_size>
82
+ rnn_type: gru
83
+ attn_type: location
84
+ hidden_size: !ref <dec_neurons>
85
+ attn_dim: 1024
86
+ num_layers: 1
87
+ scaling: 1.0
88
+ channels: 10
89
+ kernel_size: 100
90
+ re_init: True
91
+ dropout: !ref <dropout>
92
+
93
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
94
+ input_size: !ref <dnn_neurons>
95
+ n_neurons: !ref <output_neurons>
96
+
97
+ seq_lin: !new:speechbrain.nnet.linear.Linear
98
+ input_size: !ref <dec_neurons>
99
+ n_neurons: !ref <output_neurons>
100
+
101
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
102
+ apply_log: True
103
+
104
+ lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
105
+ output_neurons: !ref <output_neurons>
106
+ embedding_dim: !ref <emb_size>
107
+ activation: !name:torch.nn.LeakyReLU
108
+ dropout: 0.0
109
+ rnn_layers: 2
110
+ rnn_neurons: 2048
111
+ dnn_blocks: 1
112
+ dnn_neurons: 512
113
+ return_hidden: True # For inference
114
+
115
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
116
+
117
+ asr_model: !new:torch.nn.ModuleList
118
+ - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
119
+
120
+ beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearchLM
121
+ embedding: !ref <emb>
122
+ decoder: !ref <dec>
123
+ linear: !ref <seq_lin>
124
+ language_model: !ref <lm_model>
125
+ bos_index: !ref <bos_index>
126
+ eos_index: !ref <eos_index>
127
+ min_decode_ratio: !ref <min_decode_ratio>
128
+ max_decode_ratio: !ref <max_decode_ratio>
129
+ beam_size: !ref <beam_size>
130
+ eos_threshold: !ref <eos_threshold>
131
+ using_max_attn_shift: !ref <using_max_attn_shift>
132
+ max_attn_shift: !ref <max_attn_shift>
133
+ coverage_penalty: !ref <coverage_penalty>
134
+ lm_weight: !ref <lm_weight>
135
+ temperature: !ref <temperature>
136
+ temperature_lm: !ref <temperature_lm>
137
+
138
+
139
+ modules:
140
+ compute_features: !ref <compute_features>
141
+ normalize: !ref <normalize>
142
+ asr_model: !ref <asr_model>
143
+ asr_encoder: !ref <enc>
144
+ asr_decoder: !ref <dec>
145
+ lm_model: !ref <lm_model>
146
+ beam_searcher: !ref <beam_searcher>
147
+
148
+
149
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
150
+ loadables:
151
+ asr_model: !ref <asr_model>
152
+ lm_model: !ref <lm_model>
153
+ tokenizer: !ref <tokenizer>
154
+ save_dir: model_checkpoints
lm.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f73e243f5f0eb070a05a2069ba5b9014232e926384cc7d5ba24cde060c84997
3
+ size 212420087
model.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e795c7e18f3bab6bd5f47060ab852233deb33d7d550e989994c8683901e18d5
3
+ size 479555971
normalizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e11bfd7dbe13a266d13c00f6ff042a00fdbd40f3f5973928f9b49c33da32b512
3
+ size 1409
tokenizer.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37a6cba34cd520b33fd83612d5efc8ba7e351166541eb2726642bb3032234d31
3
+ size 253217