Titouan committed on
Commit 5c36e66
1 Parent(s): e1bcce3

pushing model

Files changed (1): hyperparams.yaml (+169 -0)
hyperparams.yaml ADDED
@@ -0,0 +1,169 @@
# ############################################################################
# Model: E2E ASR with attention-based decoder
# Encoder: CRDNN model
# Decoder: GRU + beamsearch + Transformer LM
# Tokens: BPE with unigram
# Losses: CTC + NLL
# Training: LibriSpeech 960h
# Authors: Ju-Chieh Chou, Mirco Ravanelli, Abdel Heba, Peter Plantinga, Samuele Cornell,
#          Sung-Lin Yeh, Titouan Parcollet 2020
# ############################################################################
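# Note: in the corresponding training recipe (not part of this inference file),
# the two losses are typically combined as
#   loss = ctc_weight * ctc_loss + (1 - ctc_weight) * nll_loss,
# with the CTC branch usually dropped after the first epochs; only the
# decoding-time ctc_weight_decode appears below.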

# Feature parameters
sample_rate: 16000
n_fft: 400
n_mels: 40
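# (n_fft: 400 samples at 16 kHz corresponds to Fbank's default 25 ms analysis
# window; the default hop is 10 ms.)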

# Model parameters
activation: !name:torch.nn.LeakyReLU
dropout: 0.15
cnn_blocks: 2
cnn_channels: (64, 128)
inter_layer_pooling_size: (2, 2)
cnn_kernelsize: (3, 3)
time_pooling_size: 4
rnn_class: !name:speechbrain.nnet.RNN.LSTM
rnn_layers: 4
rnn_neurons: 1024
rnn_bidirectional: True
dnn_blocks: 1
dnn_neurons: 1024
emb_size: 1024
dec_neurons: 1024
output_neurons: 5000 # Number of tokens (same as LM)
blank_index: 0
pad_index: -1
bos_index: 1
eos_index: 2
unk_index: 0

# Decoding parameters
min_decode_ratio: 0.0
max_decode_ratio: 1.0
beam_size: 40
eos_threshold: 1.5
using_max_attn_shift: True
max_attn_shift: 300
lm_weight: 0.80
ctc_weight_decode: 0.40
ctc_window_size: 200
coverage_penalty: 1.5
temperature: 1.0
temperature_lm: 1.0
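# Roughly, during beam search LM shallow fusion adds lm_weight * LM
# log-probabilities to the seq2seq scores, while joint CTC/attention decoding
# interpolates the attention and CTC scores with ctc_weight_decode (CTC
# scoring restricted to a window of ctc_window_size frames).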

normalize: !new:speechbrain.processing.features.InputNormalization
    norm_type: global

compute_features: !new:speechbrain.lobes.features.Fbank
    sample_rate: !ref <sample_rate>
    n_fft: !ref <n_fft>
    n_mels: !ref <n_mels>

enc: !new:speechbrain.lobes.models.CRDNN.CRDNN
    input_shape: [null, null, !ref <n_mels>]
    activation: !ref <activation>
    dropout: !ref <dropout>
    cnn_blocks: !ref <cnn_blocks>
    cnn_channels: !ref <cnn_channels>
    cnn_kernelsize: !ref <cnn_kernelsize>
    inter_layer_pooling_size: !ref <inter_layer_pooling_size>
    time_pooling: True
    using_2d_pooling: False
    time_pooling_size: !ref <time_pooling_size>
    rnn_class: !ref <rnn_class>
    rnn_layers: !ref <rnn_layers>
    rnn_neurons: !ref <rnn_neurons>
    rnn_bidirectional: !ref <rnn_bidirectional>
    rnn_re_init: True
    dnn_blocks: !ref <dnn_blocks>
    dnn_neurons: !ref <dnn_neurons>
    use_rnnp: True
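# time_pooling_size: 4 downsamples the time axis 4x after the CNN blocks,
# shortening the sequence processed by the bidirectional LSTM and attended to
# by the decoder; inter_layer_pooling_size pools the frequency axis between
# CNN blocks.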

emb: !new:speechbrain.nnet.embedding.Embedding
    num_embeddings: !ref <output_neurons>
    embedding_dim: !ref <emb_size>

dec: !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
    enc_dim: !ref <dnn_neurons>
    input_size: !ref <emb_size>
    rnn_type: gru
    attn_type: location
    hidden_size: !ref <dec_neurons>
    attn_dim: 1024
    num_layers: 1
    scaling: 1.0
    channels: 10
    kernel_size: 100
    re_init: True
    dropout: !ref <dropout>

ctc_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dnn_neurons>
    n_neurons: !ref <output_neurons>

seq_lin: !new:speechbrain.nnet.linear.Linear
    input_size: !ref <dec_neurons>
    n_neurons: !ref <output_neurons>

log_softmax: !new:speechbrain.nnet.activations.Softmax
    apply_log: True

# This is the Transformer LM used for decoding. Visit the HuggingFace model
# corresponding to the pretrained_lm_tokenizer_path for more details.
# NB: It has to match the pre-trained TransformerLM!
lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM
    vocab: 5000
    d_model: 768
    nhead: 12
    num_encoder_layers: 12
    num_decoder_layers: 0
    d_ffn: 3072
    dropout: 0.0
    activation: !name:torch.nn.GELU
    normalize_before: False
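# With num_decoder_layers: 0, only the (causally masked) encoder stack is
# used, i.e. a GPT-style decoder-only LM with 12 layers, d_model 768, and
# 12 attention heads.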

tokenizer: !new:sentencepiece.SentencePieceProcessor

asr_model: !new:torch.nn.ModuleList
    - [!ref <enc>, !ref <emb>, !ref <dec>, !ref <ctc_lin>, !ref <seq_lin>]

beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearchTransformerLM
    embedding: !ref <emb>
    decoder: !ref <dec>
    linear: !ref <seq_lin>
    ctc_linear: !ref <ctc_lin>
    language_model: !ref <lm_model>
    bos_index: !ref <bos_index>
    eos_index: !ref <eos_index>
    blank_index: !ref <blank_index>
    min_decode_ratio: !ref <min_decode_ratio>
    max_decode_ratio: !ref <max_decode_ratio>
    beam_size: !ref <beam_size>
    eos_threshold: !ref <eos_threshold>
    using_max_attn_shift: !ref <using_max_attn_shift>
    max_attn_shift: !ref <max_attn_shift>
    coverage_penalty: !ref <coverage_penalty>
    lm_weight: !ref <lm_weight>
    ctc_weight: !ref <ctc_weight_decode>
    ctc_window_size: !ref <ctc_window_size>
    temperature: !ref <temperature>
    temperature_lm: !ref <temperature_lm>

modules:
    compute_features: !ref <compute_features>
    asr_enc: !ref <enc>
    asr_dec: !ref <dec>
    ctc_lin: !ref <ctc_lin>
    seq_lin: !ref <seq_lin>
    normalize: !ref <normalize>
    lm_model: !ref <lm_model>
    beam_searcher: !ref <beam_searcher>

# The pretrainer maps pretrained files to the instances declared in this YAML.
pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
    loadables:
        asr: !ref <asr_model>
        lm: !ref <lm_model>
        tokenizer: !ref <tokenizer>
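
Once the checkpoints referenced under loadables are available, this file can be consumed directly with hyperpyyaml and the Pretrainer. Below is a minimal loading sketch; the default_source id is an assumed placeholder (the hosting library normally supplies the real file paths), and exact Pretrainer signatures may vary slightly across SpeechBrain versions:

    from hyperpyyaml import load_hyperpyyaml

    # Parse the hyperparameters: every !new: entry above is instantiated here.
    with open("hyperparams.yaml") as fin:
        hparams = load_hyperpyyaml(fin)

    # Fetch and load the pretrained parameters declared under `loadables`.
    # NOTE: the source id is a hypothetical placeholder, not part of this file.
    hparams["pretrainer"].collect_files(default_source="<huggingface-repo-id>")
    hparams["pretrainer"].load_collected()

    # The fully initialized decoder: takes encoder states (plus lengths) and
    # returns beam-searched hypotheses rescored with CTC and the Transformer LM.
    searcher = hparams["beam_searcher"]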