sinarashidi committed on
Commit
801e44e
1 Parent(s): ebe92f1

Upload hyperparams.yaml with huggingface_hub

Browse files
Files changed (1) hide show
  1. hyperparams.yaml +228 -0
hyperparams.yaml ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ###################################
2
+ # Experiment Parameters and setup #
3
+ ###################################
4
+ seed: 888
5
+ __set_seed: !apply:torch.manual_seed [888]
6
+ output_folder: results/s2ut/888
7
+ save_folder: results/s2ut/888/save
8
+ train_log: results/s2ut/888/train_log.txt
9
+ epochs: 100
10
+ use_tensorboard: true
11
+
12
+ progress_samples: true
13
+ progress_sample_path: results/s2ut/888/samples
14
+ progress_samples_interval: 1
15
+ progress_batch_sample_size: 4
16
+
17
+ evaluation_interval: 10
18
+
19
+ #################################
20
+ # Data files and pre-processing #
21
+ #################################
22
+ src_data_folder: /workspace/speechbrain/common_voice # e.g., /corpus/CommonVoice/fr (French Data)
23
+ tgt_data_folder: /workspace/speechbrain/cvss # e.g., /corpus/CV4/fr (English Data)
24
+ sample_rate: 16000
25
+
26
+ train_json: results/s2ut/888/save/train.json
27
+ valid_json: results/s2ut/888/save/valid.json
28
+ valid_small_json: results/s2ut/888/save/valid_small.json
29
+ test_json: results/s2ut/888/save/test.json
30
+ splits: [train, valid_small, valid, test]
31
+ skip_prep: false
32
+
33
+ # SSL model used to encode target features
34
+ encoder_source: facebook/hubert-base-ls960
35
+ layer: 6
36
+ kmeans_source: speechbrain/tts-hifigan-unit-hubert-l6-k100-ljspeech
37
+ codes_folder: results/s2ut/888/save/codes
38
+ skip_extract: false
39
+
40
+ # Vocoder model used for evaluation
41
+ vocoder_source: speechbrain/tts-hifigan-unit-hubert-l6-k100-ljspeech
42
+ vocoder_download_path: results/s2ut/888/save/pretrained_models/vocoder
43
+
44
+ # ASR model used for evaluation
45
+ asr_source: speechbrain/asr-wav2vec2-librispeech
46
+ asr_download_path: results/s2ut/888/save/pretrained_models/asr
47
+
48
+ # Wav2vec2 encoder
49
+ wav2vec2_source: m3hrdadfi/wav2vec2-large-xlsr-persian-v3
50
+ wav2vec2_download_path: results/s2ut/888/save/pretrained_models
51
+
52
+ # wav2vec2 encoder specific parameters
53
+ wav2vec2_frozen: false
54
+ wav2vec2_freeze_steps: 10000
55
+
56
+ ####################### Training Parameters ####################################
57
+ lr: 0.0005
58
+ lr_wav2vec: 0.00001
59
+ loss_reduction: batchmean
60
+
61
+ # Outputs
62
+ # blank_index: 102
63
+ bos_index: 100
64
+ eos_index: 101
65
+ pad_index: 102
66
+ label_smoothing: 0.2
67
+
68
+ # Dynamic batching
69
+ sorting: random
70
+ num_workers: 4
71
+ dynamic_batching: true
72
+ max_batch_len: 80 # 40 GB GPU
73
+ num_bucket: 200
74
+
75
+ train_batch_size: 32 # if not using dynamic batching
76
+ valid_batch_size: 1
77
+
78
+ dynamic_batch_sampler:
79
+ max_batch_len: 80
80
+ num_buckets: 200
81
+ shuffle_ex: true # if true re-creates batches at each epoch shuffling examples.
82
+ batch_ordering: random
83
+ max_batch_ex: 128
84
+
85
+ train_dataloader_opts:
86
+ batch_size: 32
87
+ drop_last: false
88
+ num_workers: 4
89
+ collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
90
+ padding_kwargs:
91
+ value: 102
92
+
93
+ valid_dataloader_opts:
94
+ batch_size: 1
95
+ num_workers: 4
96
+ collate_fn: !name:speechbrain.dataio.batch.PaddedBatch
97
+ padding_kwargs:
98
+ value: 102
99
+
100
+ ################################
101
+ # Model Parameters and model #
102
+ ################################
103
+
104
+ # Feature parameters (W2V2 etc)
105
+ features_dim: 1024 # large wav2vec output dimension, for base replace by 768
106
+
107
+ # Length Regulator
108
+ enc_kernel_size: 3
109
+ enc_stride: 2
110
+
111
+ # Transformer
112
+ embedding_size: 512
113
+ d_model: 512
114
+ nhead: 8
115
+ num_encoder_layers: 0
116
+ num_decoder_layers: 6
117
+ d_ffn: 2048
118
+ transformer_dropout: 0.1
119
+ activation: &id001 !name:torch.nn.GELU
120
+ output_neurons: 103 # /!\ needs to be changed accordingly to the vocabulary
121
+ attention_type: RelPosMHAXL # "RelPosMHAXL" or "regularMHA"
122
+
123
+ # Decoding parameters
124
+ test_bs: 10
125
+ min_decode_ratio: 0.0
126
+ max_decode_ratio: 1.0
127
+
128
+ ############################## models ################################
129
+ wav2vec2: &id002 !new:speechbrain.lobes.models.huggingface_transformers.wav2vec2.Wav2Vec2
130
+ source: m3hrdadfi/wav2vec2-large-xlsr-persian-v3
131
+ output_norm: true ### Test in baseline_v2
132
+ freeze: false
133
+ freeze_feature_extractor: false
134
+ save_path: results/s2ut/888/save/pretrained_models
135
+ apply_spec_augment: true
136
+
137
+ enc: &id003 !new:speechbrain.nnet.CNN.Conv1d
138
+ input_shape: [null, null, 1024]
139
+ out_channels: 512
140
+ kernel_size: 3
141
+ stride: 2
142
+
143
+ transformer: &id004 !new:speechbrain.lobes.models.transformer.TransformerST.TransformerST
144
+ # yamllint disable-line rule:line-length
145
+ input_size: 512
146
+ tgt_vocab: 103
147
+ d_model: 512
148
+ nhead: 8
149
+ num_encoder_layers: 0
150
+ num_decoder_layers: 6
151
+ d_ffn: 2048
152
+ dropout: 0.1
153
+ activation: *id001
154
+ attention_type: RelPosMHAXL
155
+ normalize_before: true
156
+ causal: true
157
+
158
+ log_softmax: !new:speechbrain.nnet.activations.Softmax
159
+ apply_log: true
160
+
161
+ seq_lin: &id005 !new:speechbrain.nnet.linear.Linear
162
+
163
+ input_size: 512
164
+ n_neurons: 103
165
+
166
+ modules:
167
+ wav2vec2: *id002
168
+ enc: *id003
169
+ transformer: *id004
170
+ seq_lin: *id005
171
+ model: &id006 !new:torch.nn.ModuleList
172
+ - [*id003, *id004, *id005]
173
+ opt_class: !name:torch.optim.AdamW
174
+ lr: 0.0005
175
+ betas: (0.9, 0.98)
176
+
177
+ wav2vec_opt_class: !name:torch.optim.AdamW
178
+ lr: 0.00001
179
+
180
+ seq_cost: !name:speechbrain.nnet.losses.nll_loss
181
+ label_smoothing: 0.2
182
+ reduction: batchmean
183
+
184
+ noam_annealing: &id008 !new:speechbrain.nnet.schedulers.NoamScheduler
185
+ lr_initial: 0.0005
186
+ n_warmup_steps: 5000
187
+
188
+ wav2vec_annealing: &id009 !new:speechbrain.nnet.schedulers.NewBobScheduler
189
+ initial_value: 0.00001
190
+ improvement_threshold: 0.0025
191
+ annealing_factor: 0.98
192
+
193
+ #epoch object
194
+ epoch_counter: &id007 !new:speechbrain.utils.epoch_loop.EpochCounter
195
+ limit: 100
196
+
197
+ train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
198
+ save_file: results/s2ut/888/train_log.txt
199
+
200
+ valid_search: !new:speechbrain.decoders.seq2seq.S2STransformerGreedySearcher
201
+ modules: [*id004, *id005, null]
202
+ bos_index: 100
203
+ eos_index: 101
204
+ min_decode_ratio: 0.0
205
+ max_decode_ratio: 1.0
206
+ temperature: 1.0
207
+
208
+ test_search: !new:speechbrain.decoders.seq2seq.S2STransformerBeamSearcher
209
+ modules: [*id004, *id005]
210
+ bos_index: 100
211
+ eos_index: 101
212
+ min_decode_ratio: 0.0
213
+ max_decode_ratio: 1.0
214
+ beam_size: 10
215
+
216
+ acc_computer: !name:speechbrain.utils.Accuracy.AccuracyStats
217
+ bleu_computer: !name:speechbrain.utils.bleu.BLEUStats
218
+ merge_words: false
219
+
220
+ #checkpointer
221
+ checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
222
+ checkpoints_dir: results/s2ut/888/save
223
+ recoverables:
224
+ model: *id006
225
+ wav2vec2: *id002
226
+ counter: *id007
227
+ noam_scheduler: *id008
228
+ wav2vec_scheduler: *id009