ddwkim committed
Commit 80186c4
Parent(s): c3f0326

Update hyperparams.yaml

Files changed (1):
  1. hyperparams.yaml +63 -47

hyperparams.yaml CHANGED
@@ -6,57 +6,54 @@
 # losses: CTC + KLdiv (Label Smoothing loss)
 # Training: KsponSpeech 965.2h
 # Based on the works of: Jianyuan Zhong, Titouan Parcollet 2021
-# Authors: Dongwon Kim, Dongwoo Kim 2021
+# Authors: Dong Won Kim, Dongwoo Kim 2021, 2024
 # ############################################################################
 # Seed needs to be set at top of yaml, before objects with parameters are made
+# ############################################################################

 # Feature parameters
 sample_rate: 16000
-n_fft: 400
+n_fft: 512
 n_mels: 80

 ####################### Model parameters ###########################
 # Transformer
-d_model: 256
-nhead: 4
+d_model: 512
+nhead: 8
 num_encoder_layers: 12
 num_decoder_layers: 6
 d_ffn: 2048
-transformer_dropout: 0.0
+transformer_dropout: 0.1
 activation: !name:torch.nn.GELU
 output_neurons: 5000
-vocab_size: 5000

 # Outputs
 blank_index: 0
+label_smoothing: 0.1
 pad_index: 0
 bos_index: 1
 eos_index: 2
-unk_index: 0

 # Decoding parameters
 min_decode_ratio: 0.0
 max_decode_ratio: 1.0
-test_beam_size: 10
-lm_weight: 0.0
+test_beam_size: 66
+lm_weight: 0.60
 ctc_weight_decode: 0.40

 ############################## models ################################

-normalizer: !new:speechbrain.processing.features.InputNormalization
-    norm_type: global
-
 CNN: !new:speechbrain.lobes.models.convolution.ConvolutionFrontEnd
     input_shape: (8, 10, 80)
-    num_blocks: 3
+    num_blocks: 2
     num_layers_per_block: 1
-    out_channels: (64, 64, 64)
-    kernel_sizes: (5, 5, 1)
-    strides: (2, 2, 1)
-    residuals: (False, False, True)
-
+    out_channels: (64, 32)
+    kernel_sizes: (3, 3)
+    strides: (2, 2)
+    residuals: (False, False)
+
 Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR # yamllint disable-line rule:line-length
-    input_size: 1280
+    input_size: 640
     tgt_vocab: !ref <output_neurons>
     d_model: !ref <d_model>
     nhead: !ref <nhead>
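The retuned front end explains the Transformer's new `input_size: 640`: two stride-2 blocks downsample the 80 mel bins to 80 / 4 = 20, and the final block emits 32 channels, so the flattened feature dimension is 20 * 32 = 640. Below is a minimal shape check, not part of the repo, assuming `ConvolutionFrontEnd` returns features laid out as `(batch, time, freq, channels)`:

```python
import torch
from speechbrain.lobes.models.convolution import ConvolutionFrontEnd

# Same settings as the updated CNN block above.
cnn = ConvolutionFrontEnd(
    input_shape=(8, 10, 80),
    num_blocks=2,
    num_layers_per_block=1,
    out_channels=(64, 32),
    kernel_sizes=(3, 3),
    strides=(2, 2),
    residuals=(False, False),
)

x = torch.rand(8, 10, 80)   # (batch, time, n_mels) dummy batch
out = cnn(x)                # assumed layout: (batch, time', freq', channels)
print(out.shape[2] * out.shape[3])  # expected: 20 * 32 = 640
```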
@@ -70,20 +67,6 @@ Transformer: !new:speechbrain.lobes.models.transformer.TransformerASR.TransformerASR
     normalize_before: True
     causal: False

-# NB: It has to match the pre-trained TransformerLM!!
-lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM # yamllint disable-line rule:line-length
-    vocab: !ref <output_neurons>
-    d_model: 768
-    nhead: 12
-    num_encoder_layers: 12
-    num_decoder_layers: 0
-    d_ffn: 3072
-    dropout: 0.0
-    activation: !name:torch.nn.GELU
-    normalize_before: False
-
-tokenizer: !new:sentencepiece.SentencePieceProcessor
-
 ctc_lin: !new:speechbrain.nnet.linear.Linear
     input_size: !ref <d_model>
     n_neurons: !ref <output_neurons>
@@ -92,25 +75,62 @@ seq_lin: !new:speechbrain.nnet.linear.Linear
     input_size: !ref <d_model>
     n_neurons: !ref <output_neurons>

-decoder: !new:speechbrain.decoders.S2STransformerBeamSearch
-    modules: [!ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]
-    bos_index: !ref <bos_index>
-    eos_index: !ref <eos_index>
-    blank_index: !ref <blank_index>
-    min_decode_ratio: !ref <min_decode_ratio>
-    max_decode_ratio: !ref <max_decode_ratio>
-    beam_size: !ref <test_beam_size>
-    ctc_weight: !ref <ctc_weight_decode>
-    lm_weight: !ref <lm_weight>
-    lm_modules: !ref <lm_model>
-    temperature: 1.30
-    temperature_lm: 1.30
-    using_eos_threshold: False
-    length_normalization: True
+transformerlm_scorer: !new:speechbrain.decoders.scorer.TransformerLMScorer
+    language_model: !ref <lm_model>
+    temperature: 1.15
+
+ctc_scorer: !new:speechbrain.decoders.scorer.CTCScorer
+    eos_index: !ref <eos_index>
+    blank_index: !ref <blank_index>
+    ctc_fc: !ref <ctc_lin>
+
+scorer: !new:speechbrain.decoders.scorer.ScorerBuilder
+    full_scorers: [!ref <transformerlm_scorer>, !ref <ctc_scorer>]
+    weights:
+        transformerlm: !ref <lm_weight>
+        ctc: !ref <ctc_weight_decode>
+
+
+decoder: !new:speechbrain.decoders.S2STransformerBeamSearcher
+    modules: [!ref <Transformer>, !ref <seq_lin>]
+    bos_index: !ref <bos_index>
+    eos_index: !ref <eos_index>
+    min_decode_ratio: !ref <min_decode_ratio>
+    max_decode_ratio: !ref <max_decode_ratio>
+    beam_size: !ref <test_beam_size>
+    temperature: 1.15
+    using_eos_threshold: False
+    length_normalization: True
+    scorer: !ref <scorer>

 log_softmax: !new:torch.nn.LogSoftmax
     dim: -1

+normalizer: !new:speechbrain.processing.features.InputNormalization
+    norm_type: global
+
+compute_features: !new:speechbrain.lobes.features.Fbank
+    sample_rate: !ref <sample_rate>
+    n_fft: !ref <n_fft>
+    n_mels: !ref <n_mels>
+
+# This is the Transformer LM that is used according to the HuggingFace repository
+# Visit the HuggingFace model corresponding to the pretrained_lm_tokenizer_path
+# for more details about the model.
+# NB: It has to match the pre-trained TransformerLM!!
+lm_model: !new:speechbrain.lobes.models.transformer.TransformerLM.TransformerLM
+    vocab: 5000
+    d_model: 768
+    nhead: 12
+    num_encoder_layers: 12
+    num_decoder_layers: 0
+    d_ffn: 3072
+    dropout: 0.0
+    activation: !name:torch.nn.GELU
+    normalize_before: False
+
+tokenizer: !new:sentencepiece.SentencePieceProcessor
+
 Tencoder: !new:speechbrain.lobes.models.transformer.TransformerASR.EncoderWrapper
     transformer: !ref <Transformer>
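The core of this hunk is the migration from `S2STransformerBeamSearch`, which took `ctc_weight`, `lm_weight`, and `lm_modules` directly, to the scorer API: CTC and Transformer-LM scores are wrapped in scorer objects, fused by a `ScorerBuilder`, and handed to `S2STransformerBeamSearcher`. Here is the same decoding stack built in Python rather than YAML, a sketch that assumes `transformer`, `seq_lin`, `ctc_lin`, and `lm_model` have already been constructed as in the config:

```python
from speechbrain.decoders import S2STransformerBeamSearcher
from speechbrain.decoders.scorer import (
    CTCScorer,
    ScorerBuilder,
    TransformerLMScorer,
)

# Mirrors the YAML: LM and CTC scores are fused with weights
# 0.60 and 0.40 at every beam-search step.
scorer = ScorerBuilder(
    full_scorers=[
        TransformerLMScorer(language_model=lm_model, temperature=1.15),
        CTCScorer(eos_index=2, blank_index=0, ctc_fc=ctc_lin),
    ],
    weights={"transformerlm": 0.60, "ctc": 0.40},
)

decoder = S2STransformerBeamSearcher(
    modules=[transformer, seq_lin],  # ctc_lin now lives in the CTC scorer
    bos_index=1,
    eos_index=2,
    min_decode_ratio=0.0,
    max_decode_ratio=1.0,
    beam_size=66,
    temperature=1.15,
    using_eos_threshold=False,
    length_normalization=True,
    scorer=scorer,
)
```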
@@ -121,14 +141,10 @@ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
     cnn: !ref <CNN>
     transformer_encoder: !ref <Tencoder>

+# Models
 asr_model: !new:torch.nn.ModuleList
     - [!ref <CNN>, !ref <Transformer>, !ref <seq_lin>, !ref <ctc_lin>]

-compute_features: !new:speechbrain.lobes.features.Fbank
-    sample_rate: !ref <sample_rate>
-    n_fft: !ref <n_fft>
-    n_mels: !ref <n_mels>
-
 modules:
     compute_features: !ref <compute_features>
     normalizer: !ref <normalizer>
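`compute_features` and `normalizer` were only moved, not changed; they now sit with the other model definitions in the previous hunk, and the pipeline itself stays the same: 80-band log-Mel filterbanks followed by global input normalization. A sketch of how these two objects process a waveform, with dummy audio and a lengths tensor added purely for illustration:

```python
import torch
from speechbrain.lobes.features import Fbank
from speechbrain.processing.features import InputNormalization

compute_features = Fbank(sample_rate=16000, n_fft=512, n_mels=80)
normalizer = InputNormalization(norm_type="global")

wav = torch.rand(1, 32000)      # two seconds of dummy 16 kHz audio
feats = compute_features(wav)   # (batch, frames, 80) log-Mel features
lengths = torch.ones(1)         # relative lengths, one full-length item
feats = normalizer(feats, lengths)
```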
@@ -146,4 +162,4 @@ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
     normalizer: !ref <normalizer>
     asr: !ref <asr_model>
     lm: !ref <lm_model>
-    tokenizer: !ref <tokenizer>
+    tokenizer: !ref <tokenizer>
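With the pretrainer wiring otherwise unchanged, the updated checkpoint still loads through SpeechBrain's pretrained interface. A usage sketch assuming SpeechBrain 1.0, where the scorer API and `speechbrain.inference` live; `<user>/<repo>` and `sample.wav` are placeholders, not values from this commit:

```python
from speechbrain.inference.ASR import EncoderDecoderASR

asr = EncoderDecoderASR.from_hparams(
    source="<user>/<repo>",  # placeholder: this model's Hub repo id
    savedir="pretrained_model",
)
print(asr.transcribe_file("sample.wav"))  # placeholder audio path
```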