sinarashidi committed on
Commit
01a6e6b
1 Parent(s): 801e44e

Update hyperparams.yaml

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +70 -198
hyperparams.yaml CHANGED
@@ -1,114 +1,26 @@
1
- ###################################
2
- # Experiment Parameters and setup #
3
- ###################################
4
- seed: 888
5
- __set_seed: !apply:torch.manual_seed [888]
6
- output_folder: results/s2ut/888
7
- save_folder: results/s2ut/888/save
8
- train_log: results/s2ut/888/train_log.txt
9
- epochs: 100
10
- use_tensorboard: true
11
-
12
- progress_samples: true
13
- progress_sample_path: results/s2ut/888/samples
14
- progress_samples_interval: 1
15
- progress_batch_sample_size: 4
16
-
17
- evaluation_interval: 10
18
-
19
- #################################
20
- # Data files and pre-processing #
21
- #################################
22
- src_data_folder: /workspace/speechbrain/common_voice # e.g, /corpus/CommonVoice/fr (French Data)
23
- tgt_data_folder: /workspace/speechbrain/cvss # e.g, /corpus/CV4/fr (English Data)
24
  sample_rate: 16000
25
 
26
- train_json: results/s2ut/888/save/train.json
27
- valid_json: results/s2ut/888/save/valid.json
28
- valid_small_json: results/s2ut/888/save/valid_small.json
29
- test_json: results/s2ut/888/save/test.json
30
- splits: [train, valid_small, valid, test]
31
- skip_prep: false
32
-
33
- # SSL model used to encode target features
34
- encoder_source: facebook/hubert-base-ls960
35
- layer: 6
36
- kmeans_source: speechbrain/tts-hifigan-unit-hubert-l6-k100-ljspeech
37
- codes_folder: results/s2ut/888/save/codes
38
- skip_extract: false
39
-
40
- # Vocoder model used for evaluation
41
- vocoder_source: speechbrain/tts-hifigan-unit-hubert-l6-k100-ljspeech
42
- vocoder_download_path: results/s2ut/888/save/pretrained_models/vocoder
43
-
44
- # ASR model used for evaluation
45
- asr_source: speechbrain/asr-wav2vec2-librispeech
46
- asr_download_path: results/s2ut/888/save/pretrained_models/asr
47
-
48
- # Wav2vec2 encoder
49
- wav2vec2_source: m3hrdadfi/wav2vec2-large-xlsr-persian-v3
50
- wav2vec2_download_path: results/s2ut/888/save/pretrained_models
51
-
52
- # wav2vec2 encoder specific parameters
53
- wav2vec2_frozen: false
54
- wav2vec2_freeze_steps: 10000
55
-
56
- ####################### Training Parameters ####################################
57
- lr: 0.0005
58
- lr_wav2vec: 0.00001
59
- loss_reduction: batchmean
60
 
61
  # Outputs
62
- # blank_index: 102
 
63
  bos_index: 100
64
  eos_index: 101
65
  pad_index: 102
66
- label_smoothing: 0.2
67
-
68
- # Dynamic batching
69
- sorting: random
70
- num_workers: 4
71
- dynamic_batching: true
72
- max_batch_len: 80 # 40 GB GPU
73
- num_bucket: 200
74
-
75
- train_batch_size: 32 # if not using dynamic batching
76
- valid_batch_size: 1
77
-
78
- dynamic_batch_sampler:
79
- max_batch_len: 80
80
- num_buckets: 200
81
- shuffle_ex: true # if true re-creates batches at each epoch shuffling examples.
82
- batch_ordering: random
83
- max_batch_ex: 128
84
-
85
- train_dataloader_opts:
86
- batch_size: 32
87
- drop_last: false
88
- num_workers: 4
89
- collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
90
- padding_kwargs:
91
- value: 102
92
-
93
- valid_dataloader_opts:
94
- batch_size: 1
95
- num_workers: 4
96
- collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
97
- padding_kwargs:
98
- value: 102
99
-
100
- ################################
101
- # Model Parameters and model #
102
- ################################
103
-
104
- # Feature parameters (W2V2 etc)
105
- features_dim: 1024 # large wav2vec output dimension, for base replace by 768
106
 
107
  # Length Regulator
108
  enc_kernel_size: 3
109
  enc_stride: 2
110
 
111
- # Transformer
112
  embedding_size: 512
113
  d_model: 512
114
  nhead: 8
@@ -116,113 +28,73 @@ num_encoder_layers: 0
116
  num_decoder_layers: 6
117
  d_ffn: 2048
118
  transformer_dropout: 0.1
119
- activation: &id001 !name:torch.nn.GELU
120
- output_neurons: 103 # /!\ needs to be changed accordingly to the vocabulary
121
- attention_type: RelPosMHAXL # "RelPosMHAXL" or "regularMHA"
122
 
123
  # Decoding parameters
124
- test_bs: 10
125
  min_decode_ratio: 0.0
126
  max_decode_ratio: 1.0
127
 
128
- ############################## models ################################
129
- wav2vec2: &id002 !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
130
- source: m3hrdadfi/wav2vec2-large-xlsr-persian-v3
131
- output_norm: true ### Test in baseline_v2
132
- freeze: false
133
- freeze_feature_extractor: false
134
- save_path: results/s2ut/888/save/pretrained_models
135
- apply_spec_augment: true
136
-
137
- enc: &id003 !new:speechbrain.nnet.CNN.Conv1d
138
- input_shape: [null, null, 1024]
139
- out_channels: 512
140
- kernel_size: 3
141
- stride: 2
142
-
143
- transformer: &id004 !new:speechbrain.lobes.models.transformer.TransformerST.TransformerST
144
- # yamllint disable-line rule:line-length
145
- input_size: 512
146
- tgt_vocab: 103
147
- d_model: 512
148
- nhead: 8
149
- num_encoder_layers: 0
150
- num_decoder_layers: 6
151
- d_ffn: 2048
152
- dropout: 0.1
153
- activation: *id001
154
- attention_type: RelPosMHAXL
155
- normalize_before: true
156
- causal: true
157
 
158
  log_softmax: !new:speechbrain.nnet.activations.Softmax
159
- apply_log: true
 
 
 
 
 
 
 
160
 
161
- seq_lin: &id005 !new:speechbrain.nnet.linear.Linear
 
 
162
 
163
- input_size: 512
164
- n_neurons: 103
 
 
 
 
 
 
165
 
166
  modules:
167
- wav2vec2: *id002
168
- enc: *id003
169
- transformer: *id004
170
- seq_lin: *id005
171
- model: &id006 !new:torch.nn.ModuleList
172
- - [*id003, *id004, *id005]
173
- opt_class: !name:torch.optim.AdamW
174
- lr: 0.0005
175
- betas: (0.9, 0.98)
176
-
177
- wav2vec_opt_class: !name:torch.optim.AdamW
178
- lr: 0.00001
179
-
180
- seq_cost: !name:speechbrain.nnet.losses.nll_loss
181
- label_smoothing: 0.2
182
- reduction: batchmean
183
-
184
- noam_annealing: &id008 !new:speechbrain.nnet.schedulers.NoamScheduler
185
- lr_initial: 0.0005
186
- n_warmup_steps: 5000
187
-
188
- wav2vec_annealing: &id009 !new:speechbrain.nnet.schedulers.NewBobScheduler
189
- initial_value: 0.00001
190
- improvement_threshold: 0.0025
191
- annealing_factor: 0.98
192
-
193
- #epoch object
194
- epoch_counter: &id007 !new:speechbrain.utils.epoch_loop.EpochCounter
195
- limit: 100
196
-
197
- train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
198
- save_file: results/s2ut/888/train_log.txt
199
-
200
- valid_search: !new:speechbrain.decoders.seq2seq.S2STransformerGreedySearcher
201
- modules: [*id004, *id005, null]
202
- bos_index: 100
203
- eos_index: 101
204
- min_decode_ratio: 0.0
205
- max_decode_ratio: 1.0
206
- temperature: 1.0
207
-
208
- test_search: !new:speechbrain.decoders.seq2seq.S2STransformerBeamSearcher
209
- modules: [*id004, *id005]
210
- bos_index: 100
211
- eos_index: 101
212
- min_decode_ratio: 0.0
213
- max_decode_ratio: 1.0
214
- beam_size: 10
215
-
216
- acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats
217
- bleu_computer: !name:speechbrain.utils.bleu.BLEUStats
218
- merge_words: false
219
-
220
- #checkpointer
221
- checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
222
- checkpoints_dir: results/s2ut/888/save
223
- recoverables:
224
- model: *id006
225
- wav2vec2: *id002
226
- counter: *id007
227
- noam_scheduler: *id008
228
- wav2vec_scheduler: *id009
 
1
+ pretrained_path: sinarashidi/s2st_fa-en_cvss
2
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  sample_rate: 16000
4
 
5
+ # HuggingFace Hub model ID of the wav2vec 2.0 checkpoint loaded as the encoder
6
+ wav2vec2_hub: m3hrdadfi/wav2vec2-large-xlsr-persian-v3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  # Outputs
9
+ vocab_size: 103
10
+ blank_index: 102
11
  bos_index: 100
12
  eos_index: 101
13
  pad_index: 102
14
+ label_smoothing: 0.0
15
+
16
+ # Encoder
17
+ features_dim: 1024
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
  # Length Regulator
20
  enc_kernel_size: 3
21
  enc_stride: 2
22
 
23
+ # Transformer decoder
24
  embedding_size: 512
25
  d_model: 512
26
  nhead: 8
 
28
  num_decoder_layers: 6
29
  d_ffn: 2048
30
  transformer_dropout: 0.1
31
+ activation: !name:torch.nn.GELU
32
+ output_neurons: !ref <vocab_size>
33
+ attention_type: "RelPosMHAXL"
34
 
35
  # Decoding parameters
 
36
  min_decode_ratio: 0.0
37
  max_decode_ratio: 1.0
38
 
39
+ wav2vec2: !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2  # speech encoder; also wrapped by `encoder` and loaded by `pretrainer`
40
+ source: !ref <wav2vec2_hub>  # resolves to the wav2vec2_hub value defined earlier in this file
41
+ output_norm: True
42
+ freeze: True
43
+ freeze_feature_extractor: True
44
+ apply_spec_augment: True
45
+ save_path: wav2vec2_checkpoints
46
+
47
+ length_regulator: !new:speechbrain.nnet.CNN.Conv1d
48
+ input_shape: [null, null, !ref <features_dim>]
49
+ out_channels: !ref <embedding_size>
50
+ kernel_size: !ref <enc_kernel_size>
51
+ stride: !ref <enc_stride>
52
+
53
+ transformer_decoder: !new:speechbrain.lobes.models.transformer.TransformerST.TransformerST # yamllint disable-line rule:line-length
54
+ input_size: !ref <embedding_size>
55
+ tgt_vocab: !ref <output_neurons>
56
+ d_model: !ref <d_model>
57
+ nhead: !ref <nhead>
58
+ num_encoder_layers: !ref <num_encoder_layers>
59
+ num_decoder_layers: !ref <num_decoder_layers>
60
+ d_ffn: !ref <d_ffn>
61
+ dropout: !ref <transformer_dropout>
62
+ activation: !ref <activation>
63
+ attention_type: !ref <attention_type>
64
+ normalize_before: True
65
+ causal: False
 
 
66
 
67
  log_softmax: !new:speechbrain.nnet.activations.Softmax
68
+ apply_log: True
69
+
70
+ seq_lin: !new:speechbrain.nnet.linear.Linear
71
+ input_size: !ref <d_model>
72
+ n_neurons: !ref <output_neurons>
73
+
74
+ model: !new:torch.nn.ModuleList
75
+ - [!ref <length_regulator>, !ref <transformer_decoder>, !ref <seq_lin>]
76
 
77
+ encoder: !new:speechbrain.nnet.containers.LengthsCapableSequential
78
+ wav2vec2: !ref <wav2vec2>
79
+ length_regulator: !ref <length_regulator>
80
 
81
+ decoder_beamsearch: !new:speechbrain.decoders.seq2seq.S2STransformerBeamSearcher
82
+ modules: [!ref <transformer_decoder>, !ref <seq_lin>]
83
+ bos_index: !ref <bos_index>
84
+ eos_index: !ref <eos_index>
85
+ min_decode_ratio: !ref <min_decode_ratio>
86
+ max_decode_ratio: !ref <max_decode_ratio>
87
+ beam_size: 10
88
+ temperature: 1.0
89
 
90
  modules:
91
+ encoder: !ref <encoder>
92
+ decoder: !ref <decoder_beamsearch>
93
+
94
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
95
+ loadables:
96
+ model: !ref <model>
97
+ wav2vec2: !ref <wav2vec2>
98
+ paths:
99
+ wav2vec2: !ref <pretrained_path>/wav2vec2.ckpt
100
+ model: !ref <pretrained_path>/model.ckpt