j-tobias committed on
Commit
15f66cd
1 Parent(s): 234fe59

latest changes

Files changed (4)
  1. app.py +41 -24
  2. cards.txt +4 -2
  3. model_evaluation.ipynb +278 -0
  4. processing.py +33 -11
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import gradio as gr
2
  from processing import run
3
- import json
4
  from huggingface_hub import login
5
  import os
6
 
@@ -11,6 +11,7 @@ import os
11
  hf_token = os.getenv("HF_Token")
12
  login(hf_token)
13
 
 
14
  # def hf_login():
15
  # hf_token = os.getenv("HF_Token")
16
  # if hf_token is None:
@@ -26,10 +27,13 @@ login(hf_token)
26
 
27
  # GENERAL OPTIONS FOR MODELS AND DATASETS
28
  MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
29
- DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recoding/Sample"]
30
 
31
  # HELPER FUNCTIONS
32
  def get_card(selected_model:str)->str:
33
 
34
  with open("cards.txt", "r") as f:
35
  cards = f.read()
@@ -42,37 +46,38 @@ def get_card(selected_model:str)->str:
42
  return "Unknown Model"
43
 
44
  def is_own(selected_option):
45
- if selected_option == "OWN Recoding/Sample":
46
  return gr.update(visible=True), gr.update(visible=True)
47
  else:
48
  return gr.update(visible=False), gr.update(visible=False)
49
 
50
  def make_visible():
51
  return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
52
 
53
 
54
 
55
 
56
 
57
  # THE ACTUAL APP
58
  with gr.Blocks() as demo:
59
 
60
 
61
  gr.Markdown('# <p style="text-align: center;">ASR Model Comparison 💬</p>')
62
- gr.Markdown("""
63
-
64
- """)
65
-
66
-
67
- gr.Markdown("""### Welcome to ASR Model Comparison Hub! 🎉
68
-
69
- Hey there, and welcome to an app designed just for developers like you, who are passionate about pushing the boundaries of Automatic Speech Recognition (ASR) technology!
70
-
71
- Here, you can easily compare different ASR models by selecting a dataset and choosing two models from the dropdown to see how they stack up against each other. If you're feeling creative, go ahead and select 'OWN' as your dataset option to upload your own audio file or record something new right in the app. Don’t forget to provide a transcription, and the app will handle the rest!
72
-
73
- ASR Model Comparison Hub uses the Word Error Rate (WER) ⬇️ (the lower the better) metric to give you a clear picture of each model's performance. And hey, don't miss out on checking the **Amazing Leaderboard** where you can see how a wide range of models have been evaluated—[Check it out here](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
74
-
75
- Happy experimenting and comparing! 🚀""")
76
 
77
 
78
 
@@ -80,13 +85,17 @@ Happy experimenting and comparing! 🚀""")
80
  with gr.Column(scale=1):
81
  pass
82
  with gr.Column(scale=5):
 
83
  data_subset = gr.Radio(
84
  value="Common Voice",
85
  choices=DATASET_OPTIONS,
86
  label="Data subset / Own Sample",
87
  )
88
- own_audio = gr.Audio(sources=['microphone'], visible=False)
89
- own_transcription = gr.TextArea(lines=2, visible=False)
90
  data_subset.change(is_own, inputs=[data_subset], outputs=[own_audio, own_transcription])
91
  with gr.Column(scale=1):
92
  pass
@@ -94,31 +103,37 @@ Happy experimenting and comparing! 🚀""")
94
 
95
  with gr.Row():
96
 
 
97
  with gr.Column(scale=1):
98
  model_1 = gr.Dropdown(
99
  choices=MODEL_OPTIONS,
100
- label="Select Model"
101
  )
102
  model_1_card = gr.Markdown("")
103
 
 
104
  with gr.Column(scale=1):
105
  model_2 = gr.Dropdown(
106
  choices=MODEL_OPTIONS,
107
- label="Select Model"
108
  )
109
  model_2_card = gr.Markdown("")
110
 
111
-
112
  model_1.change(get_card, inputs=model_1, outputs=model_1_card)
113
  model_2.change(get_card, inputs=model_2, outputs=model_2_card)
114
 
115
-
116
  eval_btn = gr.Button(
117
  value="Evaluate",
118
  variant="primary",
119
  size="sm")
120
 
121
- results_title = gr.Markdown('## <p style="text-align: center;">Results</p>', visible=False)
122
  results_md = gr.Markdown("")
123
  results_plot = gr.Plot(show_label=False, visible=False)
124
  results_df = gr.DataFrame(
@@ -127,6 +142,8 @@ Happy experimenting and comparing! 🚀""")
127
interactive=False, # Keep the DataFrame read-only for users
128
  wrap=True, # Ensure text wraps to multiple lines
129
  )
 
 
130
  eval_btn.click(make_visible, outputs=[results_plot, results_df, results_title])
131
  eval_btn.click(run, [data_subset, model_1, model_2, own_audio, own_transcription], [results_md, results_plot, results_df], show_progress=False)
132
 
 
1
  import gradio as gr
2
  from processing import run
3
+ import json # is only used if hf_login() is used
4
  from huggingface_hub import login
5
  import os
6
 
 
11
  hf_token = os.getenv("HF_Token")
12
  login(hf_token)
13
 
14
+ # I have used this function for logging into HF using a credentials file
15
  # def hf_login():
16
  # hf_token = os.getenv("HF_Token")
17
  # if hf_token is None:
 
27
 
28
  # GENERAL OPTIONS FOR MODELS AND DATASETS
29
  MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
30
+ DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
31
 
32
  # HELPER FUNCTIONS
33
  def get_card(selected_model:str)->str:
34
+ """
35
+ This function retrieves the markdown text displayed for each selected Model
36
+ """
37
 
38
  with open("cards.txt", "r") as f:
39
  cards = f.read()
 
46
  return "Unknown Model"
47
 
48
  def is_own(selected_option):
49
+ """
50
+ In case the User wants to record their own Sample, this function makes the Components visible
51
+ """
52
+ if selected_option == "OWN Recording/Sample":
53
  return gr.update(visible=True), gr.update(visible=True)
54
  else:
55
  return gr.update(visible=False), gr.update(visible=False)
56
 
57
  def make_visible():
58
+ """
59
+ This function makes the Components needed for displaying the Results visible
60
+ """
61
  return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
62
 
63
 
64
+ # Introduction and Information about the App
65
+ INTRODUCTION = """### Welcome to ASR Model Comparison Hub! 🎉
66
 
67
+ Hey there, and welcome to an app designed just for developers like you, who are passionate about pushing the boundaries of Automatic Speech Recognition (ASR) technology!
68
+
69
+ Here, you can easily compare different ASR models by selecting a dataset and choosing two models from the dropdown to see how they stack up against each other. If you're feeling creative, go ahead and select 'OWN' as your dataset option to upload your own audio file or record something new right in the app. Don’t forget to provide a transcription, and the app will handle the rest!
70
+
71
+ ASR Model Comparison Hub uses the Word Error Rate (WER) ⬇️ (the lower the better) metric to give you a clear picture of each model's performance. And hey, don't miss out on checking the **Amazing Leaderboard** where you can see how a wide range of models have been evaluated—[Check it out here](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
72
 
73
+ Happy experimenting and comparing! 🚀"""
74
 
75
  # THE ACTUAL APP
76
  with gr.Blocks() as demo:
77
 
78
 
79
  gr.Markdown('# <p style="text-align: center;">ASR Model Comparison 💬</p>')
80
+ gr.Markdown(INTRODUCTION)
81
 
82
 
83
 
 
85
  with gr.Column(scale=1):
86
  pass
87
  with gr.Column(scale=5):
88
+ # Select a Dataset to evaluate the Models on
89
  data_subset = gr.Radio(
90
  value="Common Voice",
91
  choices=DATASET_OPTIONS,
92
  label="Data subset / Own Sample",
93
  )
94
+ # Components used to record your own sample
95
+ own_audio = gr.Audio(sources=['microphone'], visible=False, label=None)
96
+ own_transcription = gr.TextArea(lines=2, visible=False, label=None)
97
+
98
+ # Event Listener to display the correct components
99
  data_subset.change(is_own, inputs=[data_subset], outputs=[own_audio, own_transcription])
100
  with gr.Column(scale=1):
101
  pass
 
103
 
104
  with gr.Row():
105
 
106
+ # This Column is for selecting the First Model
107
  with gr.Column(scale=1):
108
  model_1 = gr.Dropdown(
109
  choices=MODEL_OPTIONS,
110
+ label=None
111
  )
112
  model_1_card = gr.Markdown("")
113
 
114
+ # This Column is for selecting the Second Model
115
  with gr.Column(scale=1):
116
  model_2 = gr.Dropdown(
117
  choices=MODEL_OPTIONS,
118
+ label=None
119
  )
120
  model_2_card = gr.Markdown("")
121
 
122
+ # Event Listeners for when a model has been selected
123
  model_1.change(get_card, inputs=model_1, outputs=model_1_card)
124
  model_2.change(get_card, inputs=model_2, outputs=model_2_card)
125
 
126
+ # Main Action Button to start the Evaluation
127
  eval_btn = gr.Button(
128
  value="Evaluate",
129
  variant="primary",
130
  size="sm")
131
 
132
+ # This Section Displays the Evaluation Results
133
+ results_title = gr.Markdown(
134
+ '## <p style="text-align: center;">Results</p>',
135
+ visible=False
136
+ )
137
  results_md = gr.Markdown("")
138
  results_plot = gr.Plot(show_label=False, visible=False)
139
  results_df = gr.DataFrame(
 
142
interactive=False, # Keep the DataFrame read-only for users
143
  wrap=True, # Ensure text wraps to multiple lines
144
  )
145
+
146
+ # Event Listeners for when the main action button has been triggered
147
  eval_btn.click(make_visible, outputs=[results_plot, results_df, results_title])
148
  eval_btn.click(run, [data_subset, model_1, model_2, own_audio, own_transcription], [results_md, results_plot, results_df], show_progress=False)
149
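The show/hide wiring above, where a gr.Radio change event toggles gr.update(visible=...) on the audio and transcription components, is the core interaction pattern of app.py. Below is a minimal, self-contained sketch of that pattern; the component and function names are illustrative, only the gr.update wiring mirrors the actual app.

```python
# Minimal sketch of the visibility-toggle pattern used in app.py.
# Names like `toggle_own` and `sketch` are illustrative, not part of the commit.
import gradio as gr

OPTIONS = ["Common Voice", "OWN Recording/Sample"]

def toggle_own(choice):
    # Show the recording components only when the user picks their own sample
    show = choice == "OWN Recording/Sample"
    return gr.update(visible=show), gr.update(visible=show)

with gr.Blocks() as sketch:
    subset = gr.Radio(choices=OPTIONS, value="Common Voice", label="Data subset")
    own_audio = gr.Audio(sources=["microphone"], visible=False)
    own_text = gr.TextArea(lines=2, visible=False)
    subset.change(toggle_own, inputs=[subset], outputs=[own_audio, own_text])

if __name__ == "__main__":
    sketch.launch()
```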
 
cards.txt CHANGED
@@ -25,11 +25,13 @@
25
  - Model Paper: [Wav2vec 2.0: Learning the structure of speech from raw audio](https://ai.meta.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
26
  - Training Data: ?
27
  @@
28
- #### Whisper Large v2
29
  - ID: openai/whisper-large-v2
30
  - Hugging Face: [model](https://huggingface.co/openai/whisper-large-v2)
31
  - Creator: openai
32
  - Finetuned: No
33
  - Model Size: 1.54 B Parameters
34
  - Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
35
- - Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
 
25
  - Model Paper: [Wav2vec 2.0: Learning the structure of speech from raw audio](https://ai.meta.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
26
  - Training Data: ?
27
  @@
28
+ #### Whisper Large v2
29
  - ID: openai/whisper-large-v2
30
  - Hugging Face: [model](https://huggingface.co/openai/whisper-large-v2)
31
  - Creator: openai
32
  - Finetuned: No
33
  - Model Size: 1.54 B Parameters
34
  - Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
35
+ - Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
36
+
37
+ (evaluating this model might take a while due to its size)
model_evaluation.ipynb ADDED
@@ -0,0 +1,278 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "metadata": {
6
+ "id": "H1UloQj623Ik"
7
+ },
8
+ "source": [
9
+ "## Model Evaluation\n",
10
+ "\n",
11
+ "Hi, there welcome to my notebook! 👋\n",
12
+ "\n",
13
+ "This notebook is all about evaluating different models using a small subset of a larger Dataset.\n",
14
+ "\n",
15
+ "This Notebook is self contained meaning that expect for installing necessary libraries you can run all cells in order and everything should work\n",
16
+ "If not, feel free to leave me a message and i'll give my best to fix the issue\n",
17
+ "\n",
18
+ "All you need for this notebook to work is a **HuggingFace token**.\n",
19
+ "\n",
20
+ "If you don't know how to find it.\n",
21
+ "\n",
22
+ "Go to your Hugging Face\n",
23
+ "> Profile -> Settings -> Access Tokens -> + Create new token\n",
24
+ "\n",
25
+ "You can find the Notebook in Google Colab [here](https://colab.research.google.com/drive/1awfo4_Llrg-aypEc_MdJXcqQMj3r_Fy2?usp=share_link)"
26
+ ]
27
+ },
28
+ {
29
+ "cell_type": "markdown",
30
+ "metadata": {
31
+ "id": "hDqZY8i85pOj"
32
+ },
33
+ "source": [
34
+ "### 1. Import all necessary libraries"
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": null,
40
+ "metadata": {
41
+ "id": "iw-5LI1u2x7a"
42
+ },
43
+ "outputs": [],
44
+ "source": [
45
+ "from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor\n",
46
+ "from transformers import WhisperProcessor, WhisperForConditionalGeneration\n",
47
+ "from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC\n",
48
+ "from huggingface_hub import login\n",
49
+ "from datasets import load_dataset\n",
50
+ "from datasets import Audio\n",
51
+ "from tqdm import tqdm\n",
52
+ "import evaluate\n",
53
+ "import torch"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "markdown",
58
+ "metadata": {
59
+ "id": "gc4FRXzm5oTt"
60
+ },
61
+ "source": [
62
+ "### 2. Log in & set constants"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": null,
68
+ "metadata": {
69
+ "id": "6qTB32KR56lK"
70
+ },
71
+ "outputs": [],
72
+ "source": [
73
+ "# Login\n",
74
+ "login(\"hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx\")\n",
75
+ "\n",
76
+ "# Set constants\n",
77
+ "N_SAMPLES = 100"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "markdown",
82
+ "metadata": {
83
+ "id": "vdZmlee66ItN"
84
+ },
85
+ "source": [
86
+ "### 3. Load Dataset & Metric"
87
+ ]
88
+ },
89
+ {
90
+ "cell_type": "code",
91
+ "execution_count": null,
92
+ "metadata": {
93
+ "id": "u4MDh9HA6QwF"
94
+ },
95
+ "outputs": [],
96
+ "source": [
97
+ "# Load the Dataset\n",
98
+ "dataset = load_dataset(\"librispeech_asr\", \"clean\", split=\"test\", streaming=True, token=True, trust_remote_code=True)\n",
99
+ "dataset = dataset.cast_column(\"audio\", Audio(sampling_rate=16000))\n",
100
+ "dataset = dataset.take(N_SAMPLES)\n",
101
+ "\n",
102
+ "# Load the Evaluation Metric\n",
103
+ "wer_metric = evaluate.load(\"wer\")\n",
104
+ "\n",
105
+ "# Create Dictionary to Store Results\n",
106
+ "results = {\n",
107
+ " \"facebook/wav2vec2-base-960h\":0,\n",
108
+ " \"openai/whisper-tiny.en\":0,\n",
109
+ " \"facebook/s2t-medium-librispeech-asr\":0\n",
110
+ "}"
111
+ ]
112
+ },
113
+ {
114
+ "cell_type": "markdown",
115
+ "metadata": {
116
+ "id": "JDRzDiZ86XEa"
117
+ },
118
+ "source": [
119
+ "### 4. Evaluate the first Model"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": null,
125
+ "metadata": {
126
+ "id": "tNWLJ6bp6bnc"
127
+ },
128
+ "outputs": [],
129
+ "source": [
130
+ "# Load the 1. ASR Model\n",
131
+ "processor = Wav2Vec2Processor.from_pretrained(\"facebook/wav2vec2-base-960h\")\n",
132
+ "model = Wav2Vec2ForCTC.from_pretrained(\"facebook/wav2vec2-base-960h\")\n",
133
+ "\n",
134
+ "\n",
135
+ "# Run Inference For the First Model\n",
136
+ "predictions = []\n",
137
+ "references = []\n",
138
+ "\n",
139
+ "for i, item in tqdm(enumerate(dataset), total=N_SAMPLES):\n",
140
+ " input_values = processor(item[\"audio\"][\"array\"], sampling_rate=16000, return_tensors=\"pt\", padding=\"longest\").input_values # Batch size 1\n",
141
+ " logits = model(input_values).logits\n",
142
+ " predicted_ids = torch.argmax(logits, dim=-1)\n",
143
+ " transcription = processor.batch_decode(predicted_ids)\n",
144
+ " predictions.append(transcription[0])\n",
145
+ " references.append(item[\"text\"])\n",
146
+ "\n",
147
+ "\n",
148
+ "\n",
149
+ "wer = wer_metric.compute(references=references, predictions=predictions)\n",
150
+ "wer = round(100 * wer, 2)\n",
151
+ "print(\"WER:\", wer)\n",
152
+ "results[\"facebook/wav2vec2-base-960h\"] = wer"
153
+ ]
154
+ },
155
+ {
156
+ "cell_type": "markdown",
157
+ "metadata": {
158
+ "id": "LObMf9h-6eo_"
159
+ },
160
+ "source": [
161
+ "### 5. Evaluate the second Model\n"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "code",
166
+ "execution_count": null,
167
+ "metadata": {
168
+ "id": "kslHlHA86okx"
169
+ },
170
+ "outputs": [],
171
+ "source": [
172
+ "# Load the 2. ASR Model\n",
173
+ "processor = WhisperProcessor.from_pretrained(\"openai/whisper-tiny.en\")\n",
174
+ "model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny.en\")\n",
175
+ "\n",
176
+ "\n",
177
+ "# Run Inference For the First Model\n",
178
+ "predictions = []\n",
179
+ "references = []\n",
180
+ "\n",
181
+ "for i, item in tqdm(enumerate(dataset), total=N_SAMPLES):\n",
182
+ " input_features = processor(item[\"audio\"][\"array\"], sampling_rate=16000, return_tensors=\"pt\", padding=\"longest\").input_features # Batch size 1\n",
183
+ " predicted_ids = model.generate(input_features=input_features)\n",
184
+ " transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
185
+ " predictions.append(processor.tokenizer.normalize(transcription[0]))\n",
186
+ " references.append(processor.tokenizer.normalize(item[\"text\"]))\n",
187
+ "\n",
188
+ "\n",
189
+ "\n",
190
+ "wer = wer_metric.compute(references=references, predictions=predictions)\n",
191
+ "wer = round(100 * wer, 2)\n",
192
+ "print(\"WER:\", wer)\n",
193
+ "results[\"openai/whisper-tiny.en\"] = wer"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "markdown",
198
+ "metadata": {
199
+ "id": "VXKxHUFi6puQ"
200
+ },
201
+ "source": [
202
+ "### 6. Evaluate the third Model"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": null,
208
+ "metadata": {
209
+ "id": "mKQgkwnf6vVM"
210
+ },
211
+ "outputs": [],
212
+ "source": [
213
+ "# Load the 3. ASR Model\n",
214
+ "model = Speech2TextForConditionalGeneration.from_pretrained(\"facebook/s2t-medium-librispeech-asr\")\n",
215
+ "processor = Speech2TextProcessor.from_pretrained(\"facebook/s2t-medium-librispeech-asr\", do_upper_case=True)\n",
216
+ "\n",
217
+ "\n",
218
+ "# Run Inference For the First Model\n",
219
+ "predictions = []\n",
220
+ "references = []\n",
221
+ "\n",
222
+ "for i, item in tqdm(enumerate(dataset), total=N_SAMPLES):\n",
223
+ " sample = item[\"audio\"]\n",
224
+ " features = processor(sample[\"array\"], sampling_rate=16000, padding=True, return_tensors=\"pt\")\n",
225
+ " input_features = features.input_features\n",
226
+ " attention_mask = features.attention_mask\n",
227
+ " gen_tokens = model.generate(input_features=input_features, attention_mask=attention_mask)\n",
228
+ " transcription= processor.batch_decode(gen_tokens, skip_special_tokens=True)\n",
229
+ " predictions.append(transcription[0])\n",
230
+ " references.append(item[\"text\"])\n",
231
+ "\n",
232
+ "\n",
233
+ "\n",
234
+ "wer = wer_metric.compute(references=references, predictions=predictions)\n",
235
+ "wer = round(100 * wer, 2)\n",
236
+ "print(\"WER:\", wer)\n",
237
+ "results[\"facebook/s2t-medium-librispeech-asr\"] = wer"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "markdown",
242
+ "metadata": {
243
+ "id": "D413vLho6v_v"
244
+ },
245
+ "source": [
246
+ "### 7. Find the winning Model"
247
+ ]
248
+ },
249
+ {
250
+ "cell_type": "code",
251
+ "execution_count": null,
252
+ "metadata": {
253
+ "id": "pAlJylIB60pL"
254
+ },
255
+ "outputs": [],
256
+ "source": [
257
+ "winning_model = min(results, key=results.get)\n",
258
+ "min_wer = results[winning_model]\n",
259
+ "\n",
260
+ "print(f\"The model {winning_model} has the lowest WER Score achieved with WER: {min_wer}\")"
261
+ ]
262
+ }
263
+ ],
264
+ "metadata": {
265
+ "colab": {
266
+ "provenance": []
267
+ },
268
+ "kernelspec": {
269
+ "display_name": "Python 3",
270
+ "name": "python3"
271
+ },
272
+ "language_info": {
273
+ "name": "python"
274
+ }
275
+ },
276
+ "nbformat": 4,
277
+ "nbformat_minor": 0
278
+ }
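Each of the three evaluation cells above repeats the same loop: transcribe N_SAMPLES streamed LibriSpeech samples, then compute the WER with evaluate. Below is a hedged sketch of that shared pattern factored into one helper; the helper name evaluate_asr_model and the transcribe callable are illustrative, while the dataset and metric calls mirror the notebook.

```python
# Sketch of the loop the notebook repeats per model, factored into one helper.
# `evaluate_asr_model`, `transcribe`, and `my_whisper_transcribe` are
# illustrative names; the dataset and WER calls mirror the cells above.
from datasets import load_dataset, Audio
from tqdm import tqdm
import evaluate

N_SAMPLES = 100
wer_metric = evaluate.load("wer")

def evaluate_asr_model(transcribe, dataset, n_samples=N_SAMPLES):
    """Run transcribe(audio_array) over the streamed dataset and return WER in percent."""
    predictions, references = [], []
    for item in tqdm(dataset, total=n_samples):
        predictions.append(transcribe(item["audio"]["array"]))
        references.append(item["text"])
    wer = wer_metric.compute(references=references, predictions=predictions)
    return round(100 * wer, 2)

# Usage with the same dataset setup as in the notebook:
# dataset = load_dataset("librispeech_asr", "clean", split="test",
#                        streaming=True, token=True, trust_remote_code=True)
# dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)).take(N_SAMPLES)
# results["openai/whisper-tiny.en"] = evaluate_asr_model(my_whisper_transcribe, dataset)
```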
processing.py CHANGED
@@ -1,9 +1,14 @@
 
1
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
2
  from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
3
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
4
- import plotly.graph_objs as go
 
5
  from datasets import load_dataset
6
  from datasets import Audio
7
  import evaluate
8
  import librosa
9
  import torch
@@ -11,26 +16,41 @@ import numpy as np
11
  import pandas as pd
12
  import time
13
 
14
- N_SAMPLES = 30
 
15
 
 
16
  wer_metric = evaluate.load("wer")
17
 
18
- def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str):
19
 
20
- if data_subset is None:
 
21
  raise ValueError("No Dataset selected")
22
  if model_1 is None:
23
  raise ValueError("No Model 1 selected")
24
  if model_2 is None:
25
  raise ValueError("No Model 2 selected")
26
 
 
27
  if data_subset == "Common Voice":
28
  dataset, text_column = load_Common_Voice()
29
  elif data_subset == "Librispeech ASR clean":
30
  dataset, text_column = load_Librispeech_ASR_clean()
31
  elif data_subset == "Librispeech ASR other":
32
  dataset, text_column = load_Librispeech_ASR_other()
33
- elif data_subset == "OWN Recoding/Sample":
34
  sr, audio = own_audio
35
  audio = audio.astype(np.float32)
36
  print("AUDIO: ", type(audio), audio)
@@ -38,15 +58,16 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
38
  else:
39
  # if data_subset is None then still load load_Common_Voice
40
  dataset, text_column = load_Common_Voice()
41
- print("Dataset Loaded")
42
 
43
  model1, processor1 = load_model(model_1)
44
  model2, processor2 = load_model(model_2)
45
  print("Models Loaded")
46
 
47
-
48
-
49
- if data_subset == "OWN Recoding/Sample":
50
  sample = {"audio":{"array":audio,"sampling_rate":16000}}
51
  inference_times1 = []
52
  inference_times2 = []
@@ -98,6 +119,7 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
98
 
99
  yield results_md, fig, df
100
 
 
101
  else:
102
  references = []
103
  transcriptions1 = []
@@ -229,7 +251,7 @@ def load_model(model_id:str):
229
  processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
230
  model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
231
  model.config.forced_decoder_ids = None
232
- else:
233
  model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
234
  processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
235
 
@@ -269,7 +291,7 @@ def model_compute(model, processor, sample, model_id):
269
  transcription = processor.tokenizer.normalize(transcription[0])
270
  print("TRANSCRIPTION Whisper Large v2: ", transcription)
271
  return transcription
272
- else:
273
  sample = sample["audio"]
274
  input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
275
  predicted_ids = model.generate(input_features)
 
1
+ # Import Libraries to load Models
2
  from transformers import WhisperProcessor, WhisperForConditionalGeneration
3
  from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
4
  from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
5
+
6
+ # Import Libraries to access Datasets
7
  from datasets import load_dataset
8
  from datasets import Audio
9
+
10
+ # Helper Libraries
11
+ import plotly.graph_objs as go
12
  import evaluate
13
  import librosa
14
  import torch
 
16
  import pandas as pd
17
  import time
18
 
19
+ # This constant determines how many samples the Models are evaluated on
20
+ N_SAMPLES = 50
21
 
22
+ # Load the WER Metric
23
  wer_metric = evaluate.load("wer")
24
 
 
25
 
26
+ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str):
27
+ """
28
+ Main Function running an entire evaluation cycle
29
+
30
+ Params:
31
+ - data_subset (str) :The name of a valid Dataset to choose from ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
32
+ - model_1 (str) :The name of a valid model to choose from ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
33
+ - model_2 (str) :The name of a valid model to choose from ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
34
+ - own_audio (gr.Audio) :The return value of a gr.Audio component (sr, audio (as numpy array))
35
+ - own_transcription (str) :The paired transcription to the own_audio
36
+ """
37
+
38
+ # A little bit of Error Handling
39
+ if data_subset is None and own_audio is None and own_transcription is None:
40
  raise ValueError("No Dataset selected")
41
  if model_1 is None:
42
  raise ValueError("No Model 1 selected")
43
  if model_2 is None:
44
  raise ValueError("No Model 2 selected")
45
 
46
+ # Load the selected Dataset but only N_SAMPLES of it
47
  if data_subset == "Common Voice":
48
  dataset, text_column = load_Common_Voice()
49
  elif data_subset == "Librispeech ASR clean":
50
  dataset, text_column = load_Librispeech_ASR_clean()
51
  elif data_subset == "Librispeech ASR other":
52
  dataset, text_column = load_Librispeech_ASR_other()
53
+ elif data_subset == "OWN Recording/Sample":
54
  sr, audio = own_audio
55
  audio = audio.astype(np.float32)
56
  print("AUDIO: ", type(audio), audio)
 
58
  else:
59
  # if data_subset is None then still load load_Common_Voice
60
  dataset, text_column = load_Common_Voice()
 
61
 
62
+ # I have left the print statements in because users have access to the logs in Spaces and they might help to understand what's going on
63
+ print("Dataset Loaded")
64
+ # Load the selected Models
65
  model1, processor1 = load_model(model_1)
66
  model2, processor2 = load_model(model_2)
67
  print("Models Loaded")
68
 
69
+ # In case an own Recording is selected, only a single sample has to be evaluated
70
+ if data_subset == "OWN Recording/Sample":
 
71
  sample = {"audio":{"array":audio,"sampling_rate":16000}}
72
  inference_times1 = []
73
  inference_times2 = []
 
119
 
120
  yield results_md, fig, df
121
 
122
+ # In case a Dataset has been selected
123
  else:
124
  references = []
125
  transcriptions1 = []
 
251
  processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
252
  model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
253
  model.config.forced_decoder_ids = None
254
+ else: # In case no model has been selected, Whisper-Tiny.En is selected - just for completeness
255
  model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
256
  processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
257
 
 
291
  transcription = processor.tokenizer.normalize(transcription[0])
292
  print("TRANSCRIPTION Whisper Large v2: ", transcription)
293
  return transcription
294
+ else: # In case no model has been selected, Whisper-Tiny.En is selected - just for completeness
295
  sample = sample["audio"]
296
  input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
297
  predicted_ids = model.generate(input_features)
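
For reference, here is a hedged sketch of how a single sample could flow through the Whisper branch of model_compute, together with the per-sample timing that run() keeps in its inference_times lists; the function name transcribe_whisper_sample is illustrative and not part of processing.py.

```python
# Sketch of the Whisper path in model_compute plus per-sample timing, under the
# assumption that samples have the shape {"audio": {"array": ..., "sampling_rate": ...}}
# used in run(). `transcribe_whisper_sample` is an illustrative name.
import time
from transformers import WhisperProcessor, WhisperForConditionalGeneration

processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

def transcribe_whisper_sample(sample):
    """Return (normalized transcription, inference time in seconds) for one sample."""
    audio = sample["audio"]
    input_features = processor(
        audio["array"], sampling_rate=audio["sampling_rate"], return_tensors="pt"
    ).input_features
    start = time.time()
    predicted_ids = model.generate(input_features)
    inference_time = time.time() - start
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    # Normalizing keeps the WER comparison consistent across models
    return processor.tokenizer.normalize(transcription[0]), inference_time
```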