j-tobias committed on
Commit
234fe59
1 Parent(s): 8cfce12

added new dataset + time measurement

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. processing.py +55 -50
app.py CHANGED
@@ -26,7 +26,7 @@ login(hf_token)
26
 
27
  # GENERAL OPTIONS FOR MODELS AND DATASETS
28
  MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
29
- DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "OWN Recoding/Sample"]
30
 
31
  # HELPER FUNCTIONS
32
  def get_card(selected_model:str)->str:
 
26
 
27
  # GENERAL OPTIONS FOR MODELS AND DATASETS
28
  MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
29
+ DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recoding/Sample"]
30
 
31
  # HELPER FUNCTIONS
32
  def get_card(selected_model:str)->str:
processing.py CHANGED
@@ -9,6 +9,7 @@ import librosa
9
  import torch
10
  import numpy as np
11
  import pandas as pd
 
12
 
13
  N_SAMPLES = 30
14
 
@@ -25,13 +26,13 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
25
 
26
  if data_subset == "Common Voice":
27
  dataset, text_column = load_Common_Voice()
28
- elif data_subset == "VoxPopuli":
29
- dataset, text_column = load_Vox_Populi()
30
  elif data_subset == "Librispeech ASR clean":
31
  dataset, text_column = load_Librispeech_ASR_clean()
 
 
32
  elif data_subset == "OWN Recoding/Sample":
33
  sr, audio = own_audio
34
- audio = audio.astype(np.float32) / 32768.0
35
  print("AUDIO: ", type(audio), audio)
36
  audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
37
  else:
@@ -47,12 +48,24 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
47
 
48
  if data_subset == "OWN Recoding/Sample":
49
  sample = {"audio":{"array":audio,"sampling_rate":16000}}
 
 
 
 
50
  transcription1 = model_compute(model1, processor1, sample, model_1)
 
 
 
 
 
51
  transcription2 = model_compute(model2, processor2, sample, model_2)
 
 
 
52
 
53
  transcriptions1 = [transcription1]
54
  transcriptions2 = [transcription2]
55
- references = [own_transcription]
56
 
57
  wer1 = round(N_SAMPLES * compute_wer(references, transcriptions1), 2)
58
  wer2 = round(N_SAMPLES * compute_wer(references, transcriptions2), 2)
@@ -60,9 +73,11 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
60
  results_md = f"""
61
  #### {model_1}
62
  - WER Score: {wer1}
 
63
 
64
  #### {model_2}
65
- - WER Score: {wer2}"""
 
66
 
67
  # Create the bar plot
68
  fig = go.Figure(
@@ -89,6 +104,8 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
89
  transcriptions2 = []
90
  WER1s = []
91
  WER2s = []
 
 
92
 
93
 
94
  counter = 0
@@ -99,34 +116,51 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
99
  references.append(sample[text_column])
100
 
101
  if model_1 == model_2:
 
102
  transcription = model_compute(model1, processor1, sample, model_1)
103
-
 
 
 
104
  transcriptions1.append(transcription)
105
  transcriptions2.append(transcription)
106
  else:
 
107
  transcription1 = model_compute(model1, processor1, sample, model_1)
108
- transcription2 = model_compute(model2, processor2, sample, model_2)
 
 
109
  transcriptions1.append(transcription1)
 
 
 
 
 
 
110
  transcriptions2.append(transcription2)
111
 
112
- WER1s.append(compute_wer([sample[text_column]], [transcription1]))
113
- WER2s.append(compute_wer([sample[text_column]], [transcription2]))
 
 
114
 
115
 
116
  results_md = f"""
117
  {i}/{len(dataset)}-{'#'*i}{'_'*(N_SAMPLES-i)}
118
 
119
  #### {model_1}
120
- - WER Score: {round(sum(WER1s)/len(WER1s), 2)}
 
121
 
122
  #### {model_2}
123
- - WER Score: {round(sum(WER2s)/len(WER2s), 2)}"""
 
124
 
125
  # Create the bar plot
126
  fig = go.Figure(
127
  data=[
128
- go.Bar(x=[f"{model_1}"], y=[sum(WER1s)/len(WER1s)], showlegend=False),
129
- go.Bar(x=[f"{model_2}"], y=[sum(WER2s)/len(WER2s)], showlegend=False),
130
  ]
131
  )
132
 
@@ -138,7 +172,7 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
138
  barmode="group",
139
  )
140
 
141
- df = pd.DataFrame({"references":references, "transcriptions 1":transcriptions1,"WER 1":WER1s,"transcriptions 2":transcriptions2,"WER 2":WER2s})
142
 
143
  yield results_md, fig, df
144
 
@@ -156,32 +190,19 @@ def load_Common_Voice():
156
  sample["text"] = sample["text"].lower()
157
  return dataset, text_column
158
 
159
- def load_Vox_Populi():
160
- # Load the dataset in streaming mode
161
- dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
162
-
163
- # Optionally, preview the first item to understand the structure (can be removed in production)
164
  print(next(iter(dataset)))
165
-
166
- # Take the first 120 examples to work with
167
- dataset = dataset.take(N_SAMPLES+20)
168
- text_column = "normalized_text"
169
-
170
- # Filter out samples with empty or unwanted 'normalized_text' values and invalid audio
171
- dataset = dataset.filter(lambda x: is_valid_sample(x[text_column], x['audio']))
172
-
173
- # Take the first 100 examples after filtering
174
  dataset = dataset.take(N_SAMPLES)
175
-
176
- # Cast the 'audio' column to the desired sampling rate
177
  dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
178
-
179
- # Convert to list and return
180
  dataset = list(dataset)
 
 
181
  return dataset, text_column
182
 
183
- def load_Librispeech_ASR_clean():
184
- dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True, token=True, trust_remote_code=True)
185
  print(next(iter(dataset)))
186
  text_column = "text"
187
  dataset = dataset.take(N_SAMPLES)
@@ -191,22 +212,6 @@ def load_Librispeech_ASR_clean():
191
  sample["text"] = sample["text"].lower()
192
  return dataset, text_column
193
 
194
- def is_valid_sample(text, audio):
195
- # Check if 'normalized_text' is valid
196
- text = text.strip()
197
- if text == "" or text == "ignore time segment in scoring":
198
- return False
199
-
200
- # Check if the 'audio' array is valid (not empty and meets length criteria)
201
- if len(audio['array']) == 0: # Audio is empty
202
- return False
203
-
204
- # Optionally, check if the audio duration is within a certain range
205
- duration = audio['array'].size / audio['sampling_rate']
206
- if duration < 1.0 or duration > 60.0: # Example: Filter out audio shorter than 1 second or longer than 60 seconds
207
- return False
208
-
209
- return True
210
 
211
 
212
  # MODEL LOADERS
 
9
  import torch
10
  import numpy as np
11
  import pandas as pd
12
+ import time
13
 
14
  N_SAMPLES = 30
15
 
 
26
 
27
  if data_subset == "Common Voice":
28
  dataset, text_column = load_Common_Voice()
 
 
29
  elif data_subset == "Librispeech ASR clean":
30
  dataset, text_column = load_Librispeech_ASR_clean()
31
+ elif data_subset == "Librispeech ASR other":
32
+ dataset, text_column = load_Librispeech_ASR_other()
33
  elif data_subset == "OWN Recoding/Sample":
34
  sr, audio = own_audio
35
+ audio = audio.astype(np.float32)
36
  print("AUDIO: ", type(audio), audio)
37
  audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
38
  else:
 
48
 
49
  if data_subset == "OWN Recoding/Sample":
50
  sample = {"audio":{"array":audio,"sampling_rate":16000}}
51
+ inference_times1 = []
52
+ inference_times2 = []
53
+
54
+ time_start = time.time()
55
  transcription1 = model_compute(model1, processor1, sample, model_1)
56
+ time_stop = time.time()
57
+ duration = time_stop - time_start
58
+ inference_times1.append(duration)
59
+
60
+ time_start = time.time()
61
  transcription2 = model_compute(model2, processor2, sample, model_2)
62
+ time_stop = time.time()
63
+ duration = time_stop - time_start
64
+ inference_times2.append(duration)
65
 
66
  transcriptions1 = [transcription1]
67
  transcriptions2 = [transcription2]
68
+ references = [own_transcription.lower()]
69
 
70
  wer1 = round(N_SAMPLES * compute_wer(references, transcriptions1), 2)
71
  wer2 = round(N_SAMPLES * compute_wer(references, transcriptions2), 2)
 
73
  results_md = f"""
74
  #### {model_1}
75
  - WER Score: {wer1}
76
+ - Avg. Inference Duration: {round(sum(inference_times1)/len(inference_times1), 4)}s
77
 
78
  #### {model_2}
79
+ - WER Score: {wer2}
80
+ - Avg. Inference Duration: {round(sum(inference_times2)/len(inference_times2), 4)}s"""
81
 
82
  # Create the bar plot
83
  fig = go.Figure(
 
104
  transcriptions2 = []
105
  WER1s = []
106
  WER2s = []
107
+ inference_times1 = []
108
+ inference_times2 = []
109
 
110
 
111
  counter = 0
 
116
  references.append(sample[text_column])
117
 
118
  if model_1 == model_2:
119
+ time_start = time.time()
120
  transcription = model_compute(model1, processor1, sample, model_1)
121
+ time_stop = time.time()
122
+ duration = time_stop - time_start
123
+ inference_times1.append(duration)
124
+ inference_times2.append(duration)
125
  transcriptions1.append(transcription)
126
  transcriptions2.append(transcription)
127
  else:
128
+ time_start = time.time()
129
  transcription1 = model_compute(model1, processor1, sample, model_1)
130
+ time_stop = time.time()
131
+ duration = time_stop - time_start
132
+ inference_times1.append(duration)
133
  transcriptions1.append(transcription1)
134
+
135
+ time_start = time.time()
136
+ transcription2 = model_compute(model2, processor2, sample, model_2)
137
+ time_stop = time.time()
138
+ duration = time_stop - time_start
139
+ inference_times2.append(duration)
140
  transcriptions2.append(transcription2)
141
 
142
+ WER1s.append(round(compute_wer([sample[text_column]], [transcription1]),4))
143
+ WER2s.append(round(compute_wer([sample[text_column]], [transcription2]),4))
144
+ wer1 = round(sum(WER1s)/len(WER1s), 4)
145
+ wer2 = round(sum(WER2s)/len(WER2s), 4)
146
 
147
 
148
  results_md = f"""
149
  {i}/{len(dataset)}-{'#'*i}{'_'*(N_SAMPLES-i)}
150
 
151
  #### {model_1}
152
+ - WER Score: {wer1}
153
+ - Avg. Inference Duration: {round(sum(inference_times1)/len(inference_times1), 4)}s
154
 
155
  #### {model_2}
156
+ - WER Score: {wer2}
157
+ - Avg. Inference Duration: {round(sum(inference_times2)/len(inference_times2), 4)}s"""
158
 
159
  # Create the bar plot
160
  fig = go.Figure(
161
  data=[
162
+ go.Bar(x=[f"{model_1}"], y=[wer1], showlegend=False),
163
+ go.Bar(x=[f"{model_2}"], y=[wer2], showlegend=False),
164
  ]
165
  )
166
 
 
172
  barmode="group",
173
  )
174
 
175
+ df = pd.DataFrame({"references":references, f"{model_1}":transcriptions1,"WER 1":WER1s,f"{model_2}":transcriptions2,"WER 2":WER2s})
176
 
177
  yield results_md, fig, df
178
 
 
190
  sample["text"] = sample["text"].lower()
191
  return dataset, text_column
192
 
193
+ def load_Librispeech_ASR_clean():
194
+ dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True, token=True, trust_remote_code=True)
 
 
 
195
  print(next(iter(dataset)))
196
+ text_column = "text"
 
 
 
 
 
 
 
 
197
  dataset = dataset.take(N_SAMPLES)
 
 
198
  dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
 
 
199
  dataset = list(dataset)
200
+ for sample in dataset:
201
+ sample["text"] = sample["text"].lower()
202
  return dataset, text_column
203
 
204
+ def load_Librispeech_ASR_other():
205
+ dataset = load_dataset("librispeech_asr", "other", split="test", streaming=True, token=True, trust_remote_code=True)
206
  print(next(iter(dataset)))
207
  text_column = "text"
208
  dataset = dataset.take(N_SAMPLES)
 
212
  sample["text"] = sample["text"].lower()
213
  return dataset, text_column
214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
 
217
  # MODEL LOADERS