sangeet2020 committed on
Commit
b681042
1 Parent(s): 91af171

add model files

Browse files
Files changed (7) hide show
  1. .gitattributes +1 -0
  2. README.md +6 -14
  3. config.json +3 -0
  4. decoder.ckpt +3 -0
  5. encoder.ckpt +0 -0
  6. hyperparams.yaml +105 -0
  7. masknet.ckpt +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.psd filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -85,13 +85,15 @@ Please notice that we encourage you to read our tutorials and learn more about
85
  ### Transcribing your own audio files (in German)
86
 
87
  ```python
88
-
89
  from speechbrain.pretrained import WhisperASR
90
 
91
- asr_model = WhisperASR.from_hparams(source="speechbrain/rescuespeech_whisper", savedir="pretrained_models/rescuespeech_whisper")
92
- asr_model.transcribe_file("speechbrain/rescuespeech_whisper/example_de.wav")
93
-
94
 
 
 
 
95
  ```
96
  ### Inference on GPU
97
  To perform inference on the GPU, add `run_opts={"device":"cuda"}` when calling the `from_hparams` method.
@@ -136,14 +138,4 @@ GitHub: https://github.com/speechbrain/speechbrain
136
 
137
 
138
 
139
- ```bash
140
- from speechbrain.pretrained import SepformerSeparation as Separator
141
- from speechbrain.pretrained import WhisperASR
142
-
143
- enh_model = Separator.from_hparams(source="CKPT+2023-06-24+21-49-17+00", savedir='pretrained_models/sepformer_rescuespeech', hparams_file='hyperparams_asr.yaml')
144
- asr_model = WhisperASR.from_hparams(source="CKPT+2023-06-24+21-49-17+00", savedir="pretrained_models/whisper_rescuespeech", hparams_file='hyperparams_asr.yaml')
145
 
146
- # For custom file, change the path accordingly
147
- est_sources = enh_model.separate_file(path='example_rescuespeech16k.wav')
148
- print(asr_model(est_sources[:, :, 0]))
149
- ```
 
85
  ### Transcribing your own audio files (in German)
86
 
87
  ```python
88
+ from speechbrain.pretrained import SepformerSeparation as Separator
89
  from speechbrain.pretrained import WhisperASR
90
 
91
+ enh_model = Separator.from_hparams(source="speechbrain/noisy-whisper-resucespeech", savedir='pretrained_models/noisy-whisper-resucespeech')
92
+ asr_model = WhisperASR.from_hparams(source="speechbrain/noisy-whisper-resucespeech", savedir="pretrained_models/noisy-whisper-resucespeech")
 
93
 
94
+ # For custom file, change the path accordingly
95
+ est_sources = enh_model.separate_file(path='example_rescuespeech16k.wav')
96
+ print(asr_model(est_sources[:, :, 0]))
97
  ```
98
  ### Inference on GPU
99
  To perform inference on the GPU, add `run_opts={"device":"cuda"}` when calling the `from_hparams` method.
 
138
 
139
 
140
 
 
 
 
 
 
 
141
 
 
 
 
 
config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "speechbrain_interface": "SepformerSeparation"
3
+ }
decoder.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00d272f965100f627a4a43d45dd919a7caf867372035139a91b8ece174c8b5f1
3
+ size 17195
encoder.ckpt CHANGED
Binary files a/encoder.ckpt and b/encoder.ckpt differ
 
hyperparams.yaml ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Generated 2023-06-24 from:
2
+ # /netscratch/sagar/thesis/speechbrain/recipes/RescueSpeech/Enhancement/joint-training/transformers/hparams/robust_asr_16k.yaml
3
+ # yamllint disable
4
+ # Model: SepFormer (dual-path enhancement) + Whisper (ASR)
5
+ # Augmentation: SpecAugment
6
+ # Authors: Sangeet Sagar 2023
7
+ # ################################
8
+
9
+ # URL for the biggest whisper model.
10
+ # HuggingFace Hub identifier of the pretrained Whisper model to load.
11
+ whisper_hub: openai/whisper-large-v2
12
+ language: german
13
+
14
+ ## Model parameters
15
+ sample_rate: 16000
16
+ freeze_whisper: false
17
+ freeze_encoder_only: false
18
+ freeze_encoder: true
19
+
20
+ # These values are only used for the searchers.
21
+ # They need to be hardcoded and should not be changed with Whisper.
22
+ # They are used as part of the searching process.
23
+ # The bos token of the searcher will be timestamp_index
24
+ # and will be concatenated with the bos, language and task tokens.
25
+ timestamp_index: 50363
26
+ eos_index: 50257
27
+ bos_index: 50258
28
+
29
+ # Decoding parameters
30
+ min_decode_ratio: 0.0
31
+ max_decode_ratio: 1.0
32
+ test_beam_size: 8
33
+
34
+ num_spks: 1
35
+
36
+ # Enhancement model
37
+ Encoder: &id004 !new:speechbrain.lobes.models.dual_path.Encoder
38
+ kernel_size: 16
39
+ out_channels: 256
40
+
41
+ SBtfintra: &id001 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
42
+ num_layers: 8
43
+ d_model: 256
44
+ nhead: 8
45
+ d_ffn: 1024
46
+ dropout: 0
47
+ use_positional_encoding: true
48
+ norm_before: true
49
+
50
+ SBtfinter: &id002 !new:speechbrain.lobes.models.dual_path.SBTransformerBlock
51
+ num_layers: 8
52
+ d_model: 256
53
+ nhead: 8
54
+ d_ffn: 1024
55
+ dropout: 0
56
+ use_positional_encoding: true
57
+ norm_before: true
58
+
59
+ MaskNet: &id005 !new:speechbrain.lobes.models.dual_path.Dual_Path_Model
60
+ num_spks: 1
61
+ in_channels: 256
62
+ out_channels: 256
63
+ num_layers: 2
64
+ K: 250
65
+ intra_model: *id001
66
+ inter_model: *id002
67
+ norm: ln
68
+ linear_layer_after_inter_intra: false
69
+ skip_around_intra: true
70
+
71
+ # Enhancement-model decoder, followed by Whisper ASR and its search decoder
72
+ Decoder: &id006 !new:speechbrain.lobes.models.dual_path.Decoder
73
+ in_channels: 256
74
+ out_channels: 1
75
+ kernel_size: 16
76
+ stride: 8
77
+ bias: false
78
+
79
+ whisper: &id003 !new:speechbrain.lobes.models.huggingface_whisper.HuggingFaceWhisper
80
+ source: !ref <whisper_hub>
81
+ freeze: !ref <freeze_whisper>
82
+ freeze_encoder: !ref <freeze_encoder>
83
+ save_path: whisper_checkpoints
84
+ encoder_only: False
85
+
86
+ decoder: !new:speechbrain.decoders.seq2seq.S2SWhisperGreedySearch
87
+ model: *id003
88
+ bos_index: 50363
89
+ eos_index: 50257
90
+ min_decode_ratio: 0.0
91
+ max_decode_ratio: 1.0
92
+
93
+ # Change the path to use a local model instead of the remote one
94
+ pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
95
+ loadables:
96
+ encoder: !ref <Encoder>
97
+ masknet: !ref <MaskNet>
98
+ decoder: !ref <Decoder>
99
+ whisper: !ref <whisper>
100
+
101
+ modules:
102
+ encoder: *id004
103
+ masknet: *id005
104
+ decoder: *id006
105
+ whisper: *id003
masknet.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79872065eba79aa6b2b51ea21b918491e0f9e7a7f87eea8bd2d6fe9aa434c9d7
3
+ size 112839555