Titouan committed on
Commit
e15c027
1 Parent(s): 957b82c

pushing param file

Browse files

Files changed (1) hide show
  1. hyperparams.yaml +132 -0
hyperparams.yaml ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ################################
2
+ # Model: VGG2 + LSTM + time pooling
3
+ # Augmentation: SpecAugment
4
+ # Authors: Titouan Parcollet, Mirco Ravanelli, Peter Plantinga, Ju-Chieh Chou,
5
+ # and Abdel HEBA 2020
6
+ # ################################
7
+
8
+ # Feature parameters (FBANKs, etc.)
9
+ sample_rate: 16000
10
+ n_fft: 400
11
+ n_mels: 80
12
+
13
+ # Model parameters
14
+ activation: !name:torch.nn.LeakyReLU
15
+ dropout: 0.15
16
+ cnn_blocks: 3
17
+ cnn_channels: (128, 200, 256)
18
+ inter_layer_pooling_size: (2, 2, 2)
19
+ cnn_kernelsize: (3, 3)
20
+ time_pooling_size: 4
21
+ rnn_class: !name:speechbrain.nnet.RNN.LSTM
22
+ rnn_layers: 5
23
+ rnn_neurons: 1024
24
+ rnn_bidirectional: True
25
+ dnn_blocks: 2
26
+ dnn_neurons: 1024
27
+ emb_size: 128
28
+ dec_neurons: 1024
29
+
30
+ # Outputs
31
+ output_neurons: 500 # BPE size, index(blank/eos/bos) = 0
32
+
33
+ # Decoding parameters
34
+ # Make sure that the bos and eos indices match the ones used by the BPE tokenizer
35
+ blank_index: 0
36
+ bos_index: 0
37
+ eos_index: 0
38
+ min_decode_ratio: 0.0
39
+ max_decode_ratio: 1.0
40
+ beam_size: 80
41
+ eos_threshold: 1.5
42
+ using_max_attn_shift: True
43
+ max_attn_shift: 140
44
+ ctc_weight_decode: 0.0
45
+ temperature: 1.50
46
+
47
+ normalize: !new:speechbrain.processing.features.InputNormalization
48
+ norm_type: global
49
+
50
+ compute_features: !new:speechbrain.lobes.features.Fbank
51
+ sample_rate: !ref <sample_rate>
52
+ n_fft: !ref <n_fft>
53
+ n_mels: !ref <n_mels>
54
+
55
+ enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
56
+ input_shape: [null, null, !ref <n_mels>]
57
+ activation: !ref <activation>
58
+ dropout: !ref <dropout>
59
+ cnn_blocks: !ref <cnn_blocks>
60
+ cnn_channels: !ref <cnn_channels>
61
+ cnn_kernelsize: !ref <cnn_kernelsize>
62
+ inter_layer_pooling_size: !ref <inter_layer_pooling_size>
63
+ time_pooling: True
64
+ using_2d_pooling: False
65
+ time_pooling_size: !ref <time_pooling_size>
66
+ rnn_class: !ref <rnn_class>
67
+ rnn_layers: !ref <rnn_layers>
68
+ rnn_neurons: !ref <rnn_neurons>
69
+ rnn_bidirectional: !ref <rnn_bidirectional>
70
+ rnn_re_init: True
71
+ dnn_blocks: !ref <dnn_blocks>
72
+ dnn_neurons: !ref <dnn_neurons>
73
+
74
+ emb: !new:speechbrain.nnet.embedding.Embedding
75
+ num_embeddings: !ref <output_neurons>
76
+ embedding_dim: !ref <emb_size>
77
+
78
+ dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
79
+ enc_dim: !ref <dnn_neurons>
80
+ input_size: !ref <emb_size>
81
+ rnn_type: gru
82
+ attn_type: location
83
+ hidden_size: 1024
84
+ attn_dim: 1024
85
+ num_layers: 1
86
+ scaling: 1.0
87
+ channels: 10
88
+ kernel_size: 100
89
+ re_init: True
90
+ dropout: !ref <dropout>
91
+
92
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
93
+ input_size: !ref <dnn_neurons>
94
+ n_neurons: !ref <output_neurons>
95
+
96
+ seq_lin: !new:speechbrain.nnet.linear.Linear
97
+ input_size: !ref <dec_neurons>
98
+ n_neurons: !ref <output_neurons>
99
+
100
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
101
+ apply_log: True
102
+
103
+ asr_model: !new:torch.nn.ModuleList
104
+ - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]
105
+
106
+ beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
107
+ embedding: !ref <emb>
108
+ decoder: !ref <dec>
109
+ linear: !ref <seq_lin>
110
+ bos_index: !ref <bos_index>
111
+ eos_index: !ref <eos_index>
112
+ min_decode_ratio: !ref <min_decode_ratio>
113
+ max_decode_ratio: !ref <max_decode_ratio>
114
+ beam_size: !ref <beam_size>
115
+ eos_threshold: !ref <eos_threshold>
116
+ using_max_attn_shift: !ref <using_max_attn_shift>
117
+ max_attn_shift: !ref <max_attn_shift>
118
+ temperature: !ref <temperature>
119
+
120
+ modules:
121
+ compute_features: !ref <compute_features>
122
+ normalize: !ref <normalize>
123
+ asr_model: !ref <asr_model>
124
+ asr_encoder: !ref <enc>
125
+ asr_decoder: !ref <dec>
126
+ beam_searcher: !ref <beam_searcher>
127
+
128
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
129
+ loadables:
130
+ asr: !ref <asr_model>
131
+ lm: !ref <lm_model>
132
+ tokenizer: !ref <tokenizer>