Titouan committed on
Commit
e8b63e8
1 Parent(s): f627588
Files changed (1)
  1. hyperparams.yaml +144 -0
hyperparams.yaml ADDED
@@ -0,0 +1,144 @@
+# ############################################################################
+# Model: E2E ASR with Transformer
+# Encoder: Transformer Encoder
+# Decoder: Transformer Decoder + (CTC/ATT joint) beamsearch + TransformerLM
+# Tokens: unigram
+# losses: CTC + KLdiv (Label Smoothing loss)
+# Training: Librispeech 960h
+# Authors: Jianyuan Zhong, Titouan Parcollet 2021
+# ############################################################################
+
+# Feature parameters
+sample_rate: 16000
+n_fft: 400
+n_mels: 80
+
+####################### Model parameters ###########################
+# Transformer
+d_model: 768
+nhead: 8
+num_encoder_layers: 12
+num_decoder_layers: 6
+d_ffn: 3072
+transformer_dropout: 0.0
+activation: !name:torch.nn.GELU
+output_neurons: 5000
+vocab_size: 5000
+
+# Outputs
+blank_index: 0
+label_smoothing: 0.1
+pad_index: 0
+bos_index: 1
+eos_index: 2
+unk_index: 0
+
+# Decoding parameters
+min_decode_ratio: 0.0
+max_decode_ratio: 1.0
+valid_search_interval: 10
+valid_beam_size: 10
+test_beam_size: 66
+lm_weight: 0.60
+ctc_weight_decode: 0.52
+
+############################## models ################################
+
+CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
+    input_shape: (8, 10, 80)
+    num_blocks: 3
+    num_layers_per_block: 1
+    out_channels: (128, 256, 512)
+    kernel_sizes: (3, 3, 1)
+    strides: (2, 2, 1)
+    residuals: (False, False, False)
+
+Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR
+    input_size: 10240
+    tgt_vocab: !ref <output_neurons>
+    d_model: !ref <d_model>
+    nhead: !ref <nhead>
+    num_encoder_layers: !ref <num_encoder_layers>
+    num_decoder_layers: !ref <num_decoder_layers>
+    d_ffn: !ref <d_ffn>
+    dropout: !ref <transformer_dropout>
+    activation: !ref <activation>
+    normalize_before: False
+
+ctc_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <d_model>
+    n_neurons: !ref <output_neurons>
+
+seq_lin: !new:speechbrain.nnet.linear.Linear
+    input_size: !ref <d_model>
+    n_neurons: !ref <output_neurons>
+
+asr_encoder: !new:torch.nn.ModuleList
+    - [!ref <CNN>, !ref <Transformer>]
+
+asr_model: !new:torch.nn.ModuleList
+    - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+
+beam_searcher: !new:speechbrain.decoders.S2STransformerBeamSearch
+    modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
+    bos_index: !ref <bos_index>
+    eos_index: !ref <eos_index>
+    blank_index: !ref <blank_index>
+    min_decode_ratio: !ref <min_decode_ratio>
+    max_decode_ratio: !ref <max_decode_ratio>
+    beam_size: !ref <test_beam_size>
+    ctc_weight: !ref <ctc_weight_decode>
+    lm_weight: !ref <lm_weight>
+    lm_modules: !ref <lm_model>
+    temperature: 1.15
+    temperature_lm: 1.15
+    using_eos_threshold: False
+    length_normalization: True
+
+log_softmax: !new:torch.nn.LogSoftmax
+    dim: -1
+
+normalize: !new:speechbrain.processing.features.InputNormalization
+    norm_type: global
+    update_until_epoch: 4
+
+compute_features: !new:speechbrain.lobes.features.Fbank
+    sample_rate: !ref <sample_rate>
+    n_fft: !ref <n_fft>
+    n_mels: !ref <n_mels>
+
+# This is the TransformerLM loaded from the HuggingFace repository.
+# Visit the HuggingFace model corresponding to the pretrained_lm_tokenizer_path
+# for more details about the model.
+# NB: It has to match the pre-trained TransformerLM!
+lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM
+    vocab: 5000
+    d_model: 768
+    nhead: 12
+    num_encoder_layers: 12
+    num_decoder_layers: 0
+    d_ffn: 3072
+    dropout: 0.0
+    activation: !name:torch.nn.GELU
+    normalize_before: False
+
+tokenizer: !new:sentencepiece.SentencePieceProcessor
+
+modules:
+    compute_features: !ref <compute_features>
+    asr_encoder: !ref <asr_encoder>
+    asr_model: !ref <asr_model>
+    normalize: !ref <normalize>
+    lm_model: !ref <lm_model>
+    beam_searcher: !ref <beam_searcher>
+
+# The pretrainer allows a mapping between pretrained files and instances that
+# are declared in the yaml.
+pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
+    loadables:
+        asr: !ref <asr_model>
+        lm: !ref <lm_model>
+        tokenizer: !ref <tokenizer>
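The YAML above only declares the model graph; the checkpoints themselves are fetched and loaded by the pretrainer. Below is a minimal usage sketch (not part of the commit) of how such a file is typically loaded with HyperPyYAML and how the pretrained weights are pulled in. The repository id and checkpoint file names are assumptions for illustration; substitute the actual Hugging Face repo that hosts this hyperparams.yaml and its asr/lm/tokenizer checkpoints.

# Minimal usage sketch (assumption: asr.ckpt, lm.ckpt and tokenizer.ckpt are
# hosted in a HuggingFace model repo; the id below is illustrative).
from hyperpyyaml import load_hyperpyyaml

with open("hyperparams.yaml") as fin:
    hparams = load_hyperpyyaml(fin)

# The Pretrainer maps each key under `loadables` to a checkpoint file and
# loads it into the corresponding instance declared in the YAML.
pretrainer = hparams["pretrainer"]
source = "speechbrain/asr-transformer-transformerlm-librispeech"  # assumed repo id
pretrainer.collect_files(default_source=source)
pretrainer.load_collected()

tokenizer = hparams["tokenizer"]          # SentencePiece tokenizer, now loaded
asr_model = hparams["asr_model"]          # CNN + Transformer + seq/CTC heads
beam_searcher = hparams["beam_searcher"]  # joint CTC/attention decoding with the LM

In practice, models packaged this way are usually consumed through speechbrain.pretrained.EncoderDecoderASR.from_hparams(source=..., savedir=...), which runs the same collect/load steps internally, provided the hyperparams expose the module names that interface expects.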