Commit 234fe59 by j-tobias (parent 8cfce12): added new dataset + time measurement

Changed files:
- app.py (+1, -1)
- processing.py (+55, -50)
app.py CHANGED
@@ -26,7 +26,7 @@ login(hf_token)
 
 # GENERAL OPTIONS FOR MODELS AND DATASETS
 MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
-DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "OWN Recoding/Sample"]
+DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recoding/Sample"]
 
 # HELPER FUNCTIONS
 def get_card(selected_model:str)->str:
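The new dropdown entry only does something because `run()` in processing.py (below) branches on the exact same string. A minimal sketch of how the option list and the loaders could be kept in sync with a single lookup table — `DATASET_LOADERS` and `get_dataset` are hypothetical names, not part of this commit:

```python
# Hypothetical refactor, not part of this commit: derive the options from one
# mapping so adding a dataset needs one edit instead of two.
DATASET_LOADERS = {
    "Common Voice": load_Common_Voice,
    "Librispeech ASR clean": load_Librispeech_ASR_clean,
    "Librispeech ASR other": load_Librispeech_ASR_other,
}
DATASET_OPTIONS = list(DATASET_LOADERS) + ["OWN Recoding/Sample"]

def get_dataset(data_subset: str):
    # Raises KeyError for unknown options, mirroring run()'s else branch.
    loader = DATASET_LOADERS[data_subset]
    return loader()  # returns (dataset, text_column)
```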
processing.py CHANGED
@@ -9,6 +9,7 @@ import librosa
 import torch
 import numpy as np
 import pandas as pd
+import time
 
 N_SAMPLES = 30
 
@@ -25,13 +26,13 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
 
     if data_subset == "Common Voice":
         dataset, text_column = load_Common_Voice()
-    elif data_subset == "VoxPopuli":
-        dataset, text_column = load_Vox_Populi()
     elif data_subset == "Librispeech ASR clean":
         dataset, text_column = load_Librispeech_ASR_clean()
+    elif data_subset == "Librispeech ASR other":
+        dataset, text_column = load_Librispeech_ASR_other()
    elif data_subset == "OWN Recoding/Sample":
         sr, audio = own_audio
-        audio = audio.astype(np.float32)
+        audio = audio.astype(np.float32)
         print("AUDIO: ", type(audio), audio)
         audio = librosa.resample(audio, orig_sr=sr, target_sr=16000)
     else:
@@ -47,12 +48,24 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
 
     if data_subset == "OWN Recoding/Sample":
         sample = {"audio":{"array":audio,"sampling_rate":16000}}
+        inference_times1 = []
+        inference_times2 = []
+
+        time_start = time.time()
         transcription1 = model_compute(model1, processor1, sample, model_1)
+        time_stop = time.time()
+        duration = time_stop - time_start
+        inference_times1.append(duration)
+
+        time_start = time.time()
         transcription2 = model_compute(model2, processor2, sample, model_2)
+        time_stop = time.time()
+        duration = time_stop - time_start
+        inference_times2.append(duration)
 
         transcriptions1 = [transcription1]
         transcriptions2 = [transcription2]
-        references = [own_transcription]
+        references = [own_transcription.lower()]
 
         wer1 = round(N_SAMPLES * compute_wer(references, transcriptions1), 2)
         wer2 = round(N_SAMPLES * compute_wer(references, transcriptions2), 2)
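Each `model_compute` call is now bracketed by `time.time()` calls, with the duration pushed onto a per-model list. A minimal sketch of the same pattern factored into a helper — `timed_compute` is a hypothetical name, and `time.perf_counter()` is swapped in because it is monotonic and intended for interval timing, unlike `time.time()`:

```python
import time

def timed_compute(model, processor, sample, model_name):
    # Same signature as the repeated calls in run(); returns the transcription
    # together with the wall-clock seconds the call took.
    start = time.perf_counter()
    transcription = model_compute(model, processor, sample, model_name)
    return transcription, time.perf_counter() - start
```

With that helper, each branch reduces to `transcription1, duration = timed_compute(model1, processor1, sample, model_1)` followed by `inference_times1.append(duration)`.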
@@ -60,9 +73,11 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         results_md = f"""
 #### {model_1}
 - WER Score: {wer1}
+- Avg. Inference Duration: {round(sum(inference_times1)/len(inference_times1), 4)}s
 
 #### {model_2}
-- WER Score: {wer2}"""
+- WER Score: {wer2}
+- Avg. Inference Duration: {round(sum(inference_times2)/len(inference_times2), 4)}s"""
 
         # Create the bar plot
         fig = go.Figure(
@@ -89,6 +104,8 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
         transcriptions2 = []
         WER1s = []
         WER2s = []
+        inference_times1 = []
+        inference_times2 = []
 
 
         counter = 0
@@ -99,34 +116,51 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
             references.append(sample[text_column])
 
             if model_1 == model_2:
+                time_start = time.time()
                 transcription = model_compute(model1, processor1, sample, model_1)
-
+                time_stop = time.time()
+                duration = time_stop - time_start
+                inference_times1.append(duration)
+                inference_times2.append(duration)
                 transcriptions1.append(transcription)
                 transcriptions2.append(transcription)
             else:
+                time_start = time.time()
                 transcription1 = model_compute(model1, processor1, sample, model_1)
-                transcription2 = model_compute(model2, processor2, sample, model_2)
+                time_stop = time.time()
+                duration = time_stop - time_start
+                inference_times1.append(duration)
                 transcriptions1.append(transcription1)
+
+                time_start = time.time()
+                transcription2 = model_compute(model2, processor2, sample, model_2)
+                time_stop = time.time()
+                duration = time_stop - time_start
+                inference_times2.append(duration)
                 transcriptions2.append(transcription2)
 
-            WER1s.append(compute_wer([sample[text_column]], [transcription1]))
-            WER2s.append(compute_wer([sample[text_column]], [transcription2]))
+            WER1s.append(round(compute_wer([sample[text_column]], [transcription1]),4))
+            WER2s.append(round(compute_wer([sample[text_column]], [transcription2]),4))
+            wer1 = round(sum(WER1s)/len(WER1s), 4)
+            wer2 = round(sum(WER2s)/len(WER2s), 4)
 
 
             results_md = f"""
 {i}/{len(dataset)}-{'#'*i}{'_'*(N_SAMPLES-i)}
 
 #### {model_1}
-- WER Score: {…
+- WER Score: {wer1}
+- Avg. Inference Duration: {round(sum(inference_times1)/len(inference_times1), 4)}s
 
 #### {model_2}
-- WER Score: {…
+- WER Score: {wer2}
+- Avg. Inference Duration: {round(sum(inference_times2)/len(inference_times2), 4)}s"""
 
             # Create the bar plot
             fig = go.Figure(
                 data=[
-                    go.Bar(x=[f"{model_1}"], y=[…
-                    go.Bar(x=[f"{model_2}"], y=[…
+                    go.Bar(x=[f"{model_1}"], y=[wer1], showlegend=False),
+                    go.Bar(x=[f"{model_2}"], y=[wer2], showlegend=False),
                 ]
             )
 
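The loop now reports `sum(WER1s)/len(WER1s)`, a macro-average of per-sample WER scores. That is not the same number as a corpus-level WER, which weights each sample by its reference length, so short utterances can dominate the macro-average. A small worked example, assuming `compute_wer(references, transcriptions)` returns total word errors divided by total reference words (as jiwer and the `evaluate` "wer" metric do):

```python
references = ["hello world", "a"]
hypotheses = ["hello word", "b"]

# Per-sample WER, as the loop computes it:
per_sample = [compute_wer([r], [h]) for r, h in zip(references, hypotheses)]
# -> [0.5, 1.0]; macro-average = 0.75
macro_wer = sum(per_sample) / len(per_sample)

# Corpus-level WER pools errors and words first:
# (1 + 1) errors / (2 + 1) reference words -> 0.6667
corpus_wer = compute_wer(references, hypotheses)
```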
@@ -138,7 +172,7 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
                 barmode="group",
             )
 
-            df = pd.DataFrame({"references":references, "…
+            df = pd.DataFrame({"references":references, f"{model_1}":transcriptions1,"WER 1":WER1s,f"{model_2}":transcriptions2,"WER 2":WER2s})
 
             yield results_md, fig, df
 
@@ -156,32 +190,19 @@ def load_Common_Voice():
         sample["text"] = sample["text"].lower()
     return dataset, text_column
 
-def load_Vox_Populi():
-
-    dataset = load_dataset("facebook/voxpopuli", "en", split="test", streaming=True, trust_remote_code=True)
-
-    # Optionally, preview the first item to understand the structure (can be removed in production)
+def load_Librispeech_ASR_clean():
+    dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True, token=True, trust_remote_code=True)
     print(next(iter(dataset)))
-
-    # Take the first 120 examples to work with
-    dataset = dataset.take(N_SAMPLES+20)
-    text_column = "normalized_text"
-
-    # Filter out samples with empty or unwanted 'normalized_text' values and invalid audio
-    dataset = dataset.filter(lambda x: is_valid_sample(x[text_column], x['audio']))
-
-    # Take the first 100 examples after filtering
+    text_column = "text"
     dataset = dataset.take(N_SAMPLES)
-
-    # Cast the 'audio' column to the desired sampling rate
     dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
-
-    # Convert to list and return
     dataset = list(dataset)
+    for sample in dataset:
+        sample["text"] = sample["text"].lower()
     return dataset, text_column
 
-def load_Librispeech_ASR_clean():
-    dataset = load_dataset("librispeech_asr", "…
+def load_Librispeech_ASR_other():
+    dataset = load_dataset("librispeech_asr", "other", split="test", streaming=True, token=True, trust_remote_code=True)
     print(next(iter(dataset)))
     text_column = "text"
     dataset = dataset.take(N_SAMPLES)
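`load_Librispeech_ASR_clean` and the new `load_Librispeech_ASR_other` are identical except for the configuration name. A minimal consolidation sketch under that assumption — `load_librispeech` is a hypothetical name; the `datasets` calls (`streaming=True`, `take`, `cast_column`) are the same ones the diff uses:

```python
from datasets import load_dataset, Audio

def load_librispeech(config: str, n_samples: int = N_SAMPLES):
    # Stream the test split so only n_samples examples are downloaded.
    dataset = load_dataset("librispeech_asr", config, split="test",
                           streaming=True, token=True, trust_remote_code=True)
    dataset = dataset.take(n_samples)
    # Decode audio at the 16 kHz rate the ASR models expect.
    dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
    dataset = list(dataset)
    for sample in dataset:
        sample["text"] = sample["text"].lower()  # references are scored lowercase
    return dataset, "text"
```

`load_Librispeech_ASR_clean()` would then reduce to `return load_librispeech("clean")` and the other loader to `return load_librispeech("other")`.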
@@ -191,22 +212,6 @@ def load_Librispeech_ASR_clean():
         sample["text"] = sample["text"].lower()
     return dataset, text_column
 
-def is_valid_sample(text, audio):
-    # Check if 'normalized_text' is valid
-    text = text.strip()
-    if text == "" or text == "ignore time segment in scoring":
-        return False
-
-    # Check if the 'audio' array is valid (not empty and meets length criteria)
-    if len(audio['array']) == 0: # Audio is empty
-        return False
-
-    # Optionally, check if the audio duration is within a certain range
-    duration = audio['array'].size / audio['sampling_rate']
-    if duration < 1.0 or duration > 60.0: # Example: Filter out audio shorter than 1 second or longer than 60 seconds
-        return False
-
-    return True
 
 
 # MODEL LOADERS