sangeet2020 committed on
Commit
b8a5c94
1 Parent(s): c367aaf

add yaml files

Browse files
Files changed (2) hide show
  1. asr.yaml +53 -0
  2. enhance.yaml +72 -0
asr.yaml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated 2023-06-24 from:
2
+ # /netscratch/sagar/thesis/speechbrain/recipes/RescueSpeech/Enhancement/joint-training/transformers/hparams/robust_asr_16k.yaml
3
+ # yamllint disable
4
+ # Model: Whisper (encoder-decoder ASR, greedy search decoding)
5
+ # Augmentation: SpecAugment
6
+ # Authors: Sangeet Sagar 2023
7
+ # ################################
8
+
9
+ # HuggingFace Hub identifier of the Whisper model (a model id, not a URL).
10
+ # This is OpenAI's large-v2 checkpoint (not a Fairseq model); the language is set to German below.
11
+ whisper_hub: openai/whisper-large-v2
12
+ language: german
13
+
14
+ ## Model parameters
15
+ sample_rate: 16000
16
+ freeze_whisper: false
17
+ freeze_encoder_only: false
18
+ freeze_encoder: true
19
+
20
+ # These values are only used for the searchers.
21
+ # They need to be hardcoded and should not be changed with Whisper.
22
+ # They are used as part of the searching process.
23
+ # The bos token of the searcher will be timestamp_index
24
+ # and will be concatenated with the bos, language and task tokens.
25
+ timestamp_index: 50363
26
+ eos_index: 50257
27
+ bos_index: 50258
28
+
29
+
30
+ # ASR model
31
+ whisper: &id003 !new:speechbrain.lobes.models.huggingface_whisper.HuggingFaceWhisper
32
+ source: !ref <whisper_hub>
33
+ freeze: !ref <freeze_whisper>
34
+ freeze_encoder: !ref <freeze_encoder>
35
+ save_path: whisper_checkpoints
36
+ encoder_only: False
37
+
38
+ decoder: &id006 !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearch
39
+ model: *id003
40
+ bos_index: 50363
41
+ eos_index: 50257
42
+ min_decode_ratio: 0.0
43
+ max_decode_ratio: 1.0
44
+
45
+ # Change the path to use a local model instead of the remote one
46
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
47
+ loadables:
48
+ whisper: !ref <whisper>
49
+ decoder: !ref <decoder>
50
+
51
+ modules:
52
+ whisper: *id003
53
+ decoder: *id006
enhance.yaml ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated 2023-06-24 from:
2
+ # /netscratch/sagar/thesis/speechbrain/recipes/RescueSpeech/Enhancement/joint-training/transformers/hparams/robust_asr_16k.yaml
3
+ # yamllint disable
4
+ # Model: dual-path Transformer speech enhancement (Encoder + MaskNet + Decoder)
5
+ # Augmentation: SpecAugment
6
+ # Authors: Sangeet Sagar 2023
7
+ # ################################
8
+
9
+ ## Model parameters
10
+ sample_rate: 16000
11
+
12
+ # Decoding parameters
13
+ min_decode_ratio: 0.0
14
+ max_decode_ratio: 1.0
15
+ test_beam_size: 8
16
+
17
+ num_spks: 1
18
+
19
+ # Enhancement model
20
+ Encoder: &id004 !new:speechbrain.lobes.models.dual_path.Encoder
21
+ kernel_size: 16
22
+ out_channels: 256
23
+
24
+ SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
25
+ num_layers: 8
26
+ d_model: 256
27
+ nhead: 8
28
+ d_ffn: 1024
29
+ dropout: 0
30
+ use_positional_encoding: true
31
+ norm_before: true
32
+
33
+ SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
34
+ num_layers: 8
35
+ d_model: 256
36
+ nhead: 8
37
+ d_ffn: 1024
38
+ dropout: 0
39
+ use_positional_encoding: true
40
+ norm_before: true
41
+
42
+ MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
43
+ num_spks: 1
44
+ in_channels: 256
45
+ out_channels: 256
46
+ num_layers: 2
47
+ K: 250
48
+ intra_model: *id001
49
+ inter_model: *id002
50
+ norm: ln
51
+ linear_layer_after_inter_intra: false
52
+ skip_around_intra: true
53
+
54
+ # Enhancement model decoder (256 input channels -> 1 output channel)
55
+ Decoder: &id006 !new:speechbrain.lobes.models.dual_path.Decoder
56
+ in_channels: 256
57
+ out_channels: 1
58
+ kernel_size: 16
59
+ stride: 8
60
+ bias: false
61
+
62
+ # Change the path to use a local model instead of the remote one
63
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
64
+ loadables:
65
+ encoder: !ref <Encoder>
66
+ masknet: !ref <MaskNet>
67
+ decoder: !ref <Decoder>
68
+
69
+ modules:
70
+ encoder: *id004
71
+ masknet: *id005
72
+ decoder: *id006