j-tobias committed on
Commit
f3d14a8
1 Parent(s): ecc69a8
Files changed (5)
  1. README.md +1 -1
  2. createevalset.py +0 -0
  3. eval.py +0 -22
  4. model.py +16 -6
  5. test.v01.py +0 -25
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: VocalVenturer
+title: ASR Model Comparison
 emoji: 💬
 colorFrom: purple
 colorTo: blue
createevalset.py DELETED
File without changes
eval.py DELETED
@@ -1,22 +0,0 @@
-from dataset import Dataset
-from model import Models
-
-
-def data(dataset):
-    for i, item in enumerate(dataset):
-        yield {**item["audio"], "reference": item["norm_text"]}
-
-
-def streamed_infernce(dataset, pipeline):
-
-
-    # placeholders for predictions and references
-    predictions = []
-    references = []
-
-    # run streamed inference
-    for out in pipeline(data(dataset), batch_size=16):
-        predictions.append(pipeline(out["text"]))
-        references.append(out["reference"][0])
-
-    return predictions, references
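Note: the deleted streamed_infernce helper appended pipeline(out["text"]) instead of the decoded text itself. A minimal corrected sketch of the same streamed-inference pattern, assuming a transformers automatic-speech-recognition pipeline and dataset items that expose item["audio"] and item["norm_text"] as in the deleted code:

from transformers import pipeline

def data(dataset):
    # yield the audio dict plus the reference transcript, which this pattern
    # expects to be passed through to the pipeline output (as the deleted code assumed)
    for item in dataset:
        yield {**item["audio"], "reference": item["norm_text"]}

def streamed_inference(dataset, asr_pipeline):
    predictions = []
    references = []
    # batched, streamed inference; each output dict carries the decoded text under "text"
    for out in asr_pipeline(data(dataset), batch_size=16):
        predictions.append(out["text"])
        references.append(out["reference"][0])
    return predictions, references

# usage sketch (model name is illustrative):
# asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny.en")
# preds, refs = streamed_inference(dataset, asr)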
model.py CHANGED
@@ -2,7 +2,7 @@
 from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
 from transformers import pipeline

-import nemo.collections.asr as nemo_asr
+# import nemo.collections.asr as nemo_asr

 from dataset import Dataset
 from utils import data
@@ -44,8 +44,8 @@ class Model:
            self.model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr")
            self.processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr", do_upper_case=True)

-        elif option == "nvidia/stt_en_fastconformer_ctc_large":
-            self.model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="nvidia/stt_en_fastconformer_ctc_large")
+        # elif option == "nvidia/stt_en_fastconformer_ctc_large":
+        #     self.model = nemo_asr.models.EncDecCTCModelBPE.from_pretrained(model_name="nvidia/stt_en_fastconformer_ctc_large")

     def select(self, option:str=None):
         if option not in self.options:
@@ -61,6 +61,8 @@ class Model:
            references, predictions = self._process_openai_whisper_tiny_en(dataset)
        elif self.selected == "facebook/s2t-medium-librispeech-asr":
            references, predictions = self._process_facebook_s2t_medium(dataset)
+        # elif self.selected == "nvidia/stt_en_fastconformer_ctc_large":
+        #     references, predictions = self._process_facebook_s2t_medium(dataset)

        return references, predictions

@@ -85,8 +87,6 @@ class Model:

     def _process_facebook_s2t_medium(self, DaTaSeT:Dataset):

-
-
        def map_to_pred(batch):
            features = self.processor(batch["audio"]["array"], sampling_rate=16000, padding=True, return_tensors="pt")
            input_features = features.input_features
@@ -109,4 +109,14 @@ class Model:
            predictions.append(sample['transcription'])
            references.append(sample[text_column])

-        return references, predictions
+        return references, predictions
+
+    def _process_stt_en_fastconformer_ctc_large(self, DaTaSeT:Dataset):
+
+
+        self.model.transcribe(['2086-149220-0033.wav'])
+
+        predictions = []
+        references = []
+
+        return references, predictions
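Note: the new _process_stt_en_fastconformer_ctc_large stub transcribes a single hard-coded file and returns empty lists. A minimal sketch of how it might be completed once the commented-out NeMo model is loaded again; the item["audio"]["path"] and item["text"] keys are assumptions about the Dataset items, and the return type of NeMo's transcribe() varies across versions:

def _process_stt_en_fastconformer_ctc_large(self, DaTaSeT):
    # assumed item layout: a path to the audio file and a reference transcript per sample
    paths = [item["audio"]["path"] for item in DaTaSeT]
    references = [item["text"] for item in DaTaSeT]

    # transcribe the whole list of audio files in one call
    outputs = self.model.transcribe(paths)
    # depending on the NeMo version, outputs are plain strings or hypothesis objects with a .text field
    predictions = [o.text if hasattr(o, "text") else o for o in outputs]

    return references, predictions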
test.v01.py DELETED
@@ -1,25 +0,0 @@
-from utils import hf_login, data, compute_wer
-from dataset import Dataset
-from model import Model
-
-hf_login()
-
-
-
-
-
-
-def run_tests (dataset_choice:str, model:str):
-
-    MoDeL = Model()
-    MoDeL.select(model)
-    MoDeL.load()
-    DaTaSeT = Dataset(100)
-    DaTaSeT.load(dataset_choice)
-    references, predictions = MoDeL.process(DaTaSeT)
-    wer = compute_wer(references=references, predictions=predictions)
-    return wer
-
-
-print("WER:", run_tests(dataset_choice="GigaSpeech", model="facebook/s2t-medium-librispeech-asr"))
-
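Note: compute_wer from utils is not part of this commit. A plausible sketch of such a helper using the evaluate library (an assumption about the helper, not its actual code):

import evaluate

def compute_wer(references, predictions):
    # corpus-level word error rate over all samples, returned as a single float
    wer_metric = evaluate.load("wer")
    return wer_metric.compute(references=references, predictions=predictions)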