Adel-Moumen committed
Commit 979a53a
1 Parent(s): 0e7fdb9

Delete hyperparams_develop.yaml

Files changed (1)
  1. hyperparams_develop.yaml +0 -168
hyperparams_develop.yaml DELETED
@@ -1,168 +0,0 @@
- # ############################################################################
- # Model: E2E ASR with attention-based ASR
- # Encoder: CRDNN model
- # Decoder: GRU + beamsearch + RNNLM
- # Tokens: BPE with unigram
- # Authors: Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, Peter Plantinga 2020, Adel Moumen 2024
- # ############################################################################
-
-
- # Feature parameters
- sample_rate: 16000
- n_fft: 400
- n_mels: 40
-
- # Model parameters
- activation: !name:torch.nn.LeakyReLU
- dropout: 0.15
- cnn_blocks: 2
- cnn_channels: (128, 256)
- inter_layer_pooling_size: (2, 2)
- cnn_kernelsize: (3, 3)
- time_pooling_size: 4
- rnn_class: !name:speechbrain.nnet.RNN.LSTM
- rnn_layers: 4
- rnn_neurons: 1024
- rnn_bidirectional: True
- dnn_blocks: 2
- dnn_neurons: 512
- emb_size: 128
- dec_neurons: 1024
- output_neurons: 1000 # index(blank/eos/bos) = 0
- blank_index: 0
-
- # Decoding parameters
- bos_index: 0
- eos_index: 0
- min_decode_ratio: 0.0
- max_decode_ratio: 1.0
- beam_size: 80
- eos_threshold: 1.5
- using_max_attn_shift: True
- max_attn_shift: 240
- lm_weight: 0.50
- coverage_penalty: 1.5
- temperature: 1.25
- temperature_lm: 1.25
-
- normalizer: !new:speechbrain.processing.features.InputNormalization
-     norm_type: global
-
- compute_features: !new:speechbrain.lobes.features.Fbank
-     sample_rate: !ref <sample_rate>
-     n_fft: !ref <n_fft>
-     n_mels: !ref <n_mels>
-
- enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
-     input_shape: [null, null, !ref <n_mels>]
-     activation: !ref <activation>
-     dropout: !ref <dropout>
-     cnn_blocks: !ref <cnn_blocks>
-     cnn_channels: !ref <cnn_channels>
-     cnn_kernelsize: !ref <cnn_kernelsize>
-     inter_layer_pooling_size: !ref <inter_layer_pooling_size>
-     time_pooling: True
-     using_2d_pooling: False
-     time_pooling_size: !ref <time_pooling_size>
-     rnn_class: !ref <rnn_class>
-     rnn_layers: !ref <rnn_layers>
-     rnn_neurons: !ref <rnn_neurons>
-     rnn_bidirectional: !ref <rnn_bidirectional>
-     rnn_re_init: True
-     dnn_blocks: !ref <dnn_blocks>
-     dnn_neurons: !ref <dnn_neurons>
-
- emb: !new:speechbrain.nnet.embedding.Embedding
-     num_embeddings: !ref <output_neurons>
-     embedding_dim: !ref <emb_size>
-
- dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
-     enc_dim: !ref <dnn_neurons>
-     input_size: !ref <emb_size>
-     rnn_type: gru
-     attn_type: location
-     hidden_size: !ref <dec_neurons>
-     attn_dim: 1024
-     num_layers: 1
-     scaling: 1.0
-     channels: 10
-     kernel_size: 100
-     re_init: True
-     dropout: !ref <dropout>
-
- ctc_lin: !new:speechbrain.nnet.linear.Linear
-     input_size: !ref <dnn_neurons>
-     n_neurons: !ref <output_neurons>
-
- seq_lin: !new:speechbrain.nnet.linear.Linear
-     input_size: !ref <dec_neurons>
-     n_neurons: !ref <output_neurons>
-
- log_softmax: !new:speechbrain.nnet.activations.Softmax
-     apply_log: True
-
- lm_model: !new:speechbrain.lobes.models.RNNLM.RNNLM
-     output_neurons: !ref <output_neurons>
-     embedding_dim: !ref <emb_size>
-     activation: !name:torch.nn.LeakyReLU
-     dropout: 0.0
-     rnn_layers: 2
-     rnn_neurons: 2048
-     dnn_blocks: 1
-     dnn_neurons: 512
-     return_hidden: True # For inference
-
- tokenizer: !new:sentencepiece.SentencePieceProcessor
-
- asr_model: !new:torch.nn.ModuleList
-     - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
-
- # We compose the inference (encoder) pipeline.
- encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
-     input_shape: [null, null, !ref <n_mels>]
-     compute_features: !ref <compute_features>
-     normalize: !ref <normalizer>
-     model: !ref <enc>
-
- # Scorer
- coverage_scorer: !new:speechbrain.decoders.scorer.CoverageScorer
-     vocab_size: !ref <output_neurons>
-
- rnnlm_scorer: !new:speechbrain.decoders.scorer.RNNLMScorer
-     language_model: !ref <lm_model>
-     temperature: !ref <temperature_lm>
-
- scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
-     full_scorers: [!ref <rnnlm_scorer>,
-                    !ref <coverage_scorer>]
-     weights:
-         rnnlm: !ref <lm_weight>
-         coverage: !ref <coverage_penalty>
-
- decoder: !new:speechbrain.decoders.S2SRNNBeamSearcher
-     embedding: !ref <emb>
-     decoder: !ref <dec>
-     linear: !ref <seq_lin>
-     bos_index: !ref <bos_index>
-     eos_index: !ref <eos_index>
-     min_decode_ratio: !ref <min_decode_ratio>
-     max_decode_ratio: !ref <max_decode_ratio>
-     beam_size: !ref <beam_size>
-     eos_threshold: !ref <eos_threshold>
-     using_max_attn_shift: !ref <using_max_attn_shift>
-     max_attn_shift: !ref <max_attn_shift>
-     temperature: !ref <temperature>
-     scorer: !ref <scorer>
-
- modules:
-     normalizer: !ref <normalizer>
-     encoder: !ref <encoder>
-     decoder: !ref <decoder>
-     lm_model: !ref <lm_model>
-
- pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
-     loadables:
-         normalizer: !ref <normalizer>
-         asr: !ref <asr_model>
-         lm: !ref <lm_model>
-         tokenizer: !ref <tokenizer>
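
Note (not part of the commit): the deleted file is a SpeechBrain HyperPyYAML inference config, and its pretrainer/loadables section is what a pretrained interface uses to fetch checkpoints. Below is a minimal sketch of how such a config is typically consumed; the repository id, save directory, and audio path are placeholders, not values taken from this commit.

# Minimal usage sketch (assumptions: repo id, savedir, and audio path are illustrative).
# EncoderDecoderASR.from_hparams loads the YAML with HyperPyYAML, instantiates the
# declared objects (encoder, decoder, scorer, tokenizer), and runs the Pretrainer
# to download the checkpoints listed under loadables.
from speechbrain.inference.ASR import EncoderDecoderASR  # speechbrain >= 1.0; older versions use speechbrain.pretrained

asr = EncoderDecoderASR.from_hparams(
    source="speechbrain/asr-crdnn-rnnlm-librispeech",    # assumed/example model repo
    hparams_file="hyperparams.yaml",                      # the inference config kept in the repo
    savedir="pretrained_models/asr-crdnn-rnnlm-librispeech",
)
print(asr.transcribe_file("example.wav"))                 # illustrative audio path

With the develop copy removed, only the production hyperparams file remains in the repo for interfaces like the one sketched above to load.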