bob80333 committed on
Commit
629bfce
1 Parent(s): 0ece4a3

Upload hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +157 -0
hyperparams.yaml ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ############################################################################
2
+ # Model: E2E ST JA->EN with Conformer
3
+ # Encoder: Conformer Encoder
4
+ # Decoder: Conformer Decoder + (CTC/ATT joint)
5
+ # Tokens: BPE
6
+ # Losses: CTC
7
+ # Training: Custom JA->EN YouTube scrape, ~600h
8
+ # Authors: Eric Engelhart, 2022
9
+ # ############################################################################
10
+
11
+ # Tokenizer initialization
12
+ tokenizer: !new:sentencepiece.SentencePieceProcessor
13
+
14
+ # Features
15
+ sample_rate: 16000
16
+ n_fft: 400
17
+ n_mels: 80
18
+
19
+ # normalization
20
+ normalizer: !new:speechbrain.processing.features.InputNormalization
21
+ norm_type: global
22
+
23
+ compute_features: !new:speechbrain.lobes.features.Fbank
24
+ sample_rate: !ref <sample_rate>
25
+ n_fft: !ref <n_fft>
26
+ n_mels: !ref <n_mels>
27
+
28
+ ####################### Model parameters ###########################
29
+ # Transformer
30
+ d_model: 384
31
+ nhead: 6
32
+ num_encoder_layers: 12
33
+ num_decoder_layers: 6
34
+ d_ffn: 1536
35
+ transformer_dropout: 0.1
36
+ activation: !name:torch.nn.GELU
37
+ output_neurons: 5000
38
+ vocab_size: 5000
39
+ attention_type: "regularMHA" # "RelPosMHAXL" or "regularMHA"
40
+ kernel_size: 15
41
+ encoder_module: conformer
42
+
43
+
44
+ # Outputs
45
+ blank_index: 0
46
+ label_smoothing: 0.1
47
+ pad_index: 0
48
+ bos_index: 1
49
+ eos_index: 2
50
+ unk_index: 0
51
+
52
+ # Decoding parameters
53
+ min_decode_ratio: 0.0
54
+ max_decode_ratio: 1.0
55
+ valid_search_interval: 2
56
+ valid_beam_size: 1
57
+ test_beam_size: 25
58
+
59
+ ############################## models ################################
60
+ CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
61
+ input_shape: (8, 10, 80)
62
+ num_blocks: 2
63
+ num_layers_per_block: 1
64
+ out_channels: (256, 256)
65
+ kernel_sizes: (3, 3)
66
+ strides: (2, 2)
67
+ residuals: (False, False)
68
+
69
+ Transformer: !new:speechbrain.lobes.models.transformer.TransformerST.TransformerST # yamllint disable-line rule:line-length
70
+ input_size: 5120
71
+ tgt_vocab: !ref <output_neurons>
72
+ d_model: !ref <d_model>
73
+ nhead: !ref <nhead>
74
+ num_encoder_layers: !ref <num_encoder_layers>
75
+ num_decoder_layers: !ref <num_decoder_layers>
76
+ d_ffn: !ref <d_ffn>
77
+ dropout: !ref <transformer_dropout>
78
+ activation: !ref <activation>
79
+ ctc_weight: 0
80
+ asr_weight: 0
81
+ mt_weight: 0
82
+ asr_tgt_vocab: !ref <output_neurons>
83
+ mt_src_vocab: !ref <output_neurons>
84
+ attention_type: !ref <attention_type>
85
+ kernel_size: !ref <kernel_size>
86
+ encoder_module: !ref <encoder_module>
87
+ normalize_before: True
88
+ causal: False
89
+ max_length: 5000
90
+
91
+ # Only used when the multi-task setting is enabled
92
+ ctc_lin: !new:speechbrain.nnet.linear.Linear
93
+ input_size: !ref <d_model>
94
+ n_neurons: !ref <output_neurons>
95
+
96
+ seq_lin: !new:speechbrain.nnet.linear.Linear
97
+ input_size: !ref <d_model>
98
+ n_neurons: !ref <output_neurons>
99
+
100
+ # Used when asr_weight > 0 and ctc_weight < 1
101
+ asr_seq_lin: !new:speechbrain.nnet.linear.Linear
102
+ input_size: !ref <d_model>
103
+ n_neurons: !ref <vocab_size>
104
+
105
+
106
+ st_model: !new:torch.nn.ModuleList
107
+ - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>]
108
+
109
+
110
+ Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
111
+ transformer: !ref <Transformer>
112
+
113
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
114
+ input_shape: [null, null, !ref <n_mels>]
115
+ compute_features: !ref <compute_features>
116
+ normalize: !ref <normalizer>
117
+ cnn: !ref <CNN>
118
+ transformer_encoder: !ref <Tencoder>
119
+
120
+
121
+
122
+ decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
123
+ modules: [!ref <Transformer>, !ref <seq_lin>, null]
124
+ bos_index: !ref <bos_index>
125
+ eos_index: !ref <eos_index>
126
+ blank_index: !ref <blank_index>
127
+ min_decode_ratio: !ref <min_decode_ratio>
128
+ max_decode_ratio: !ref <max_decode_ratio>
129
+ beam_size: !ref <test_beam_size>
130
+ using_eos_threshold: True
131
+ length_normalization: True
132
+ ctc_weight: 0
133
+ lm_weight: 0
134
+
135
+
136
+ modules:
137
+ compute_features: !ref <compute_features>
138
+ normalizer: !ref <normalizer>
139
+ pre_transformer: !ref <CNN>
140
+ Transformer: !ref <Transformer>
141
+ asr_model: !ref <st_model>
142
+ encoder: !ref <encoder>
143
+ decoder: !ref <decoder>
144
+
145
+
146
+ log_softmax: !new:torch.nn.LogSoftmax
147
+ dim: -1
148
+
149
+
150
+
151
+ # The pretrainer allows a mapping between pretrained files and instances that
152
+ # are declared in the yaml.
153
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
154
+ loadables:
155
+ tokenizer: !ref <tokenizer>
156
+ st: !ref <st_model>
157
+ normalizer: !ref <normalizer>