AndyGo committed on
Commit
99d87de
1 Parent(s): 4adac32

Upload hyperparams.yaml

Files changed (1)
  1. hyperparams.yaml +166 -0
hyperparams.yaml ADDED
@@ -0,0 +1,166 @@
+ # by HuggingFace
+ # ############################################################################
+ # Model: E2E ASR with attention-based ASR
+ # Encoder: CRDNN model
+ # Decoder: GRU + beamsearch + RNNLM
+ # Tokens: BPE with unigram
+ # Authors: Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, Peter Plantinga 2020
+ # ############################################################################
+
+
+ # Feature parameters
+ sample_rate: 16000
+ n_fft: 400
+ n_mels: 40
+
+ # Model parameters
+ activation: !name:torch.nn.LeakyReLU
+ dropout: 0.15
+ cnn_blocks: 2
+ cnn_channels: (128, 256)
+ inter_layer_pooling_size: (2, 2)
+ cnn_kernelsize: (3, 3)
+ time_pooling_size: 4
+ rnn_class: !name:speechbrain.nnet.RNN.LSTM
+ rnn_layers: 4
+ rnn_neurons: 1024
+ rnn_bidirectional: True
+ dnn_blocks: 2
+ dnn_neurons: 512
+ emb_size: 128
+ dec_neurons: 1024
+ output_neurons: 1000 # index(blank/eos/bos) = 0
+ blank_index: 0
+
+ # Decoding parameters
+ bos_index: 0
+ eos_index: 0
+ min_decode_ratio: 0.0
+ max_decode_ratio: 1.0
+ beam_size: 80
+ eos_threshold: 1.5
+ using_max_attn_shift: True
+ max_attn_shift: 240
+ lm_weight: 0.50
+ coverage_penalty: 1.5
+ temperature: 1.25
+ temperature_lm: 1.25
+
+ normalizer: !new:speechbrain.processing.features.InputNormalization
+     norm_type: global
+
+ compute_features: !new:speechbrain.lobes.features.Fbank
+     sample_rate: !ref <sample_rate>
+     n_fft: !ref <n_fft>
+     n_mels: !ref <n_mels>
+
+ enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
+     input_shape: [null, null, !ref <n_mels>]
+     activation: !ref <activation>
+     dropout: !ref <dropout>
+     cnn_blocks: !ref <cnn_blocks>
+     cnn_channels: !ref <cnn_channels>
+     cnn_kernelsize: !ref <cnn_kernelsize>
+     inter_layer_pooling_size: !ref <inter_layer_pooling_size>
+     time_pooling: True
+     using_2d_pooling: False
+     time_pooling_size: !ref <time_pooling_size>
+     rnn_class: !ref <rnn_class>
+     rnn_layers: !ref <rnn_layers>
+     rnn_neurons: !ref <rnn_neurons>
+     rnn_bidirectional: !ref <rnn_bidirectional>
+     rnn_re_init: True
+     dnn_blocks: !ref <dnn_blocks>
+     dnn_neurons: !ref <dnn_neurons>
+
+ emb: !new:speechbrain.nnet.embedding.Embedding
+     num_embeddings: !ref <output_neurons>
+     embedding_dim: !ref <emb_size>
+
+ dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
+     enc_dim: !ref <dnn_neurons>
+     input_size: !ref <emb_size>
+     rnn_type: gru
+     attn_type: location
+     hidden_size: !ref <dec_neurons>
+     attn_dim: 1024
+     num_layers: 1
+     scaling: 1.0
+     channels: 10
+     kernel_size: 100
+     re_init: True
+     dropout: !ref <dropout>
+
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <dnn_neurons>
+     n_neurons: !ref <output_neurons>
+
+ seq_lin: !new:speechbrain.nnet.linear.Linear
+     input_size: !ref <dec_neurons>
+     n_neurons: !ref <output_neurons>
+
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
+     apply_log: True
+
+ lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
+     output_neurons: !ref <output_neurons>
+     embedding_dim: !ref <emb_size>
+     activation: !name:torch.nn.LeakyReLU
+     dropout: 0.0
+     rnn_layers: 2
+     rnn_neurons: 2048
+     dnn_blocks: 1
+     dnn_neurons: 512
+     return_hidden: True # For inference
+
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
+
+ asr_model: !new:torch.nn.ModuleList
+     - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
+
+ # We compose the inference (encoder) pipeline.
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
+     input_shape: [null, null, !ref <n_mels>]
+     compute_features: !ref <compute_features>
+     normalize: !ref <normalizer>
+     model: !ref <enc>
+
+ decoder: !new:speechbrain.decoders.S2SRNNBeamSearchLM
+     embedding: !ref <emb>
+     decoder: !ref <dec>
+     linear: !ref <seq_lin>
+     language_model: !ref <lm_model>
+     bos_index: !ref <bos_index>
+     eos_index: !ref <eos_index>
+     min_decode_ratio: !ref <min_decode_ratio>
+     max_decode_ratio: !ref <max_decode_ratio>
+     beam_size: !ref <beam_size>
+     eos_threshold: !ref <eos_threshold>
+     using_max_attn_shift: !ref <using_max_attn_shift>
+     max_attn_shift: !ref <max_attn_shift>
+     coverage_penalty: !ref <coverage_penalty>
+     lm_weight: !ref <lm_weight>
+     temperature: !ref <temperature>
+     temperature_lm: !ref <temperature_lm>
+
+
+ modules:
+     normalizer: !ref <normalizer>
+     encoder: !ref <encoder>
+     decoder: !ref <decoder>
+     lm_model: !ref <lm_model>
+
+ pretrained_path: AndyGo/speechbrain-asr-crdnn-rnnlm-buriy-audiobooks-2-val-mix
+ #pretrained_path_local: /content/pretrained_models
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+     loadables:
+         normalizer: !ref <normalizer>
+         asr: !ref <asr_model>
+         lm: !ref <lm_model>
+         tokenizer: !ref <tokenizer>
+     paths:
+         lm: !ref <pretrained_path>/lm.ckpt
+         #tokenizer: !ref <pretrained_path>/1000_unigram.model
+         tokenizer: !ref <pretrained_path>/tokenizer.ckpt
+         asr: !ref <pretrained_path>/asr.ckpt
+         normalizer: !ref <pretrained_path>/normalizer.ckpt
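
For reference, a minimal inference sketch (not part of the committed file): once this hyperparams.yaml and the checkpoints referenced under pretrainer (asr.ckpt, lm.ckpt, tokenizer.ckpt, normalizer.ckpt) are present in the AndyGo/speechbrain-asr-crdnn-rnnlm-buriy-audiobooks-2-val-mix repository, the model should be loadable through SpeechBrain's EncoderDecoderASR pretrained interface, which uses the modules, tokenizer, and pretrainer keys defined above. The wav path below is a placeholder, and the import path assumes a SpeechBrain version where the interface still lives in speechbrain.pretrained (it moved to speechbrain.inference.ASR in 1.0).

# Minimal inference sketch; assumes the Hub repo holds this hyperparams.yaml
# together with the asr.ckpt, lm.ckpt, tokenizer.ckpt and normalizer.ckpt files
# listed in the pretrainer paths above.
from speechbrain.pretrained import EncoderDecoderASR

asr_model = EncoderDecoderASR.from_hparams(
    source="AndyGo/speechbrain-asr-crdnn-rnnlm-buriy-audiobooks-2-val-mix",
    savedir="pretrained_models/asr-crdnn-rnnlm-buriy-audiobooks-2-val-mix",
)

# "example.wav" is a placeholder; the feature extractor above assumes 16 kHz
# audio (sample_rate: 16000), so resample the input accordingly.
print(asr_model.transcribe_file("example.wav"))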
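Equivalently, the pretrainer block can be driven by hand with HyperPyYAML, which is roughly what the pretrained interface does internally. A sketch assuming hyperparams.yaml has already been downloaded to the working directory:

# Manual loading sketch: parse the YAML with HyperPyYAML, then let the
# Pretrainer fetch and load the checkpoints listed under its "paths" key
# (they resolve against pretrained_path on the Hugging Face Hub).
from hyperpyyaml import load_hyperpyyaml

with open("hyperparams.yaml") as fin:
    hparams = load_hyperpyyaml(fin)

pretrainer = hparams["pretrainer"]
pretrainer.collect_files()   # fetches lm.ckpt, asr.ckpt, tokenizer.ckpt, normalizer.ckpt
pretrainer.load_collected()  # loads the parameters into the objects declared above

encoder = hparams["encoder"]  # Fbank -> InputNormalization -> CRDNN encoder
decoder = hparams["decoder"]  # S2SRNNBeamSearchLM with RNNLM shallow fusion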