poonehmousavi committed on
Commit
e149552
1 Parent(s): e464b8d

Upload hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +142 -0
hyperparams.yaml ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ################################
2
+ # Model: CRDNN encoder (VGG-style CNN + LSTM + DNN, time pooling) + GRU decoder + transducer joint
3
+ # Augmentation: SpecAugment
4
+ # Authors: Titouan Parcollet, Mirco Ravanelli, Peter Plantinga, Ju-Chieh Chou,
5
+ # and Abdel HEBA 2020
6
+ # ################################
7
+ # Feature parameters (FBANKS etc)
8
+ sample_rate: 16000
9
+ n_fft: 400
10
+ n_mels: 80
11
+
12
+ # Model parameters
13
+ activation: !name:torch.nn.LeakyReLU
14
+ dropout: 0.15
15
+ cnn_blocks: 3
16
+ cnn_channels: (128, 200, 256)
17
+ inter_layer_pooling_size: (2, 2, 2)
18
+ cnn_kernelsize: (3, 3)
19
+ time_pooling_size: 4
20
+ rnn_class: !name:speechbrain.nnet.RNN.LSTM
21
+ rnn_layers: 5
22
+ rnn_neurons: 1024
23
+ rnn_bidirectional: True
24
+ dnn_blocks: 2
25
+ dnn_neurons: 1024
26
+ dec_neurons: 1024
27
+ output_neurons: 1000 # index(blank/eos/bos) = 0
28
+ joint_dim: 1024
29
+ blank_index: 0
30
+
31
+ # Outputs
32
+ output_neurons: 1000 # BPE size, index(blank/eos/bos) = 0
33
+ # Decoding parameters
34
+ # Make sure the bos and eos indices match those of the BPE tokenizer
35
+ blank_index: 0
36
+ bos_index: 0
37
+ eos_index: 0
38
+
39
+ min_decode_ratio: 0.0
40
+ max_decode_ratio: 1.0
41
+ beam_size: 4
42
+ nbest: 1
43
+ # By default, {state,expand}_beam = 2.3, as mentioned in the paper:
44
+ # https://arxiv.org/abs/1904.02619
45
+ state_beam: 2.3
46
+ expand_beam: 2.3
47
+
48
+
49
+
50
+ normalizer: !new:speechbrain.processing.features.InputNormalization
51
+ norm_type: global
52
+
53
+ compute_features: !new:speechbrain.lobes.features.Fbank
54
+ sample_rate: !ref <sample_rate>
55
+ n_fft: !ref <n_fft>
56
+ n_mels: !ref <n_mels>
57
+
58
+ enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
59
+ input_shape: [null, null, !ref <n_mels>]
60
+ activation: !ref <activation>
61
+ dropout: !ref <dropout>
62
+ cnn_blocks: !ref <cnn_blocks>
63
+ cnn_channels: !ref <cnn_channels>
64
+ cnn_kernelsize: !ref <cnn_kernelsize>
65
+ inter_layer_pooling_size: !ref <inter_layer_pooling_size>
66
+ time_pooling: True
67
+ using_2d_pooling: False
68
+ time_pooling_size: !ref <time_pooling_size>
69
+ rnn_class: !ref <rnn_class>
70
+ rnn_layers: !ref <rnn_layers>
71
+ rnn_neurons: !ref <rnn_neurons>
72
+ rnn_bidirectional: !ref <rnn_bidirectional>
73
+ rnn_re_init: True
74
+ dnn_blocks: !ref <dnn_blocks>
75
+ dnn_neurons: !ref <dnn_neurons>
76
+
77
+ enc_lin: !new:speechbrain.nnet.linear.Linear
78
+ input_size: !ref <dnn_neurons>
79
+ n_neurons: !ref <joint_dim>
80
+
81
+ emb: !new:speechbrain.nnet.embedding.Embedding
82
+ num_embeddings: !ref <output_neurons>
83
+ consider_as_one_hot: True
84
+ blank_id: !ref <blank_index>
85
+
86
+ dec: !new:speechbrain.nnet.RNN.GRU
87
+ input_shape: [null, null, !ref <output_neurons> - 1]
88
+ hidden_size: !ref <dec_neurons>
89
+ num_layers: 1
90
+ re_init: True
91
+
92
+ # For multi-task learning (MTL) with a language model (LM) over the decoder
93
+ dec_lin: !new:speechbrain.nnet.linear.Linear
94
+ input_size: !ref <dec_neurons>
95
+ n_neurons: !ref <joint_dim>
96
+ bias: False
97
+
98
+ Tjoint: !new:speechbrain.nnet.transducer.transducer_joint.Transducer_joint
99
+ joint: sum # joint [sum | concat]
100
+ nonlinearity: !ref <activation>
101
+
102
+ transducer_lin: !new:speechbrain.nnet.linear.Linear
103
+ input_size: !ref <joint_dim>
104
+ n_neurons: !ref <output_neurons>
105
+ bias: False
106
+
107
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
108
+ apply_log: True
109
+
110
+ asr_model: !new:torch.nn.ModuleList
111
+ - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <transducer_lin>]
112
+
113
+
114
+
115
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
116
+ # We compose the inference (encoder) pipeline.
117
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
118
+ input_shape: [null, null, !ref <n_mels>]
119
+ compute_features: !ref <compute_features>
120
+ normalize: !ref <normalizer>
121
+ model: !ref <enc>
122
+
123
+ decoder: !new:speechbrain.decoders.transducer.TransducerBeamSearcher
124
+ decode_network_lst: [!ref <emb>, !ref <dec>]
125
+ tjoint: !ref <Tjoint>
126
+ classifier_network: [!ref <transducer_lin>]
127
+ blank_id: !ref <blank_index>
128
+ beam_size: !ref <beam_size>
129
+ nbest: !ref <nbest>
130
+ state_beam: !ref <state_beam>
131
+ expand_beam: !ref <expand_beam>
132
+
133
+ modules:
134
+ normalizer: !ref <normalizer>
135
+ encoder: !ref <encoder>
136
+ decoder: !ref <decoder>
137
+
138
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
139
+ loadables:
140
+ normalizer: !ref <normalizer>
141
+ asr: !ref <asr_model>
142
+ tokenizer: !ref <tokenizer>