j-tobias committed • 15f66cd
Parent(s): 234fe59

latest changes

Files changed:
- app.py +41 -24
- cards.txt +4 -2
- model_evaluation.ipynb +278 -0
- processing.py +33 -11
app.py
CHANGED

@@ -1,6 +1,6 @@
 import gradio as gr
 from processing import run
-import json
+import json # only used if hf_login() is used
 from huggingface_hub import login
 import os
 
@@ -11,6 +11,7 @@ import os
 hf_token = os.getenv("HF_Token")
 login(hf_token)
 
+# I have used this function for logging into HF with a credentials file
 # def hf_login():
 #     hf_token = os.getenv("HF_Token")
 #     if hf_token is None:
@@ -26,10 +27,13 @@ login(hf_token)
 
 # GENERAL OPTIONS FOR MODELS AND DATASETS
 MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
-DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN
+DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
 
 # HELPER FUNCTIONS
 def get_card(selected_model:str)->str:
+    """
+    Retrieves the markdown card displayed for the selected model.
+    """
 
     with open("cards.txt", "r") as f:
         cards = f.read()
@@ -42,37 +46,38 @@ def get_card(selected_model:str)->str:
     return "Unknown Model"
 
 def is_own(selected_option):
-    …
+    """
+    If the user wants to record their own sample, this function makes the corresponding components visible.
+    """
+    if selected_option == "OWN Recording/Sample":
         return gr.update(visible=True), gr.update(visible=True)
     else:
         return gr.update(visible=False), gr.update(visible=False)
 
 def make_visible():
+    """
+    Makes the components needed to display the results visible.
+    """
     return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
 
 
-
-
-
+# Introduction and information about the app
+INTRODUCTION = """### Welcome to ASR Model Comparison Hub! 🎉
+
+Hey there, and welcome to an app designed just for developers like you, who are passionate about pushing the boundaries of Automatic Speech Recognition (ASR) technology!
+
+Here, you can easily compare different ASR models by selecting a dataset and choosing two models from the dropdown to see how they stack up against each other. If you're feeling creative, go ahead and select 'OWN' as your dataset option to upload your own audio file or record something new right in the app. Don’t forget to provide a transcription, and the app will handle the rest!
+
+ASR Model Comparison Hub uses the Word Error Rate (WER) ⬇️ (the lower the better) metric to give you a clear picture of each model's performance. And hey, don't miss out on checking the **Amazing Leaderboard** where you can see how a wide range of models have been evaluated—[Check it out here](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
+
+Happy experimenting and comparing! 🚀"""
 
 # THE ACTUAL APP
 with gr.Blocks() as demo:
 
 
     gr.Markdown('# <p style="text-align: center;">ASR Model Comparison 💬</p>')
-    gr.Markdown(
-    …
-    """)
-
-
-    gr.Markdown("""### Welcome to ASR Model Comparison Hub! 🎉
-    … (the same introduction text, previously passed inline)
-Happy experimenting and comparing! 🚀""")
+    gr.Markdown(INTRODUCTION)
 
 
 
@@ -80,13 +85,17 @@ Happy experimenting and comparing! 🚀""")
     with gr.Column(scale=1):
         pass
     with gr.Column(scale=5):
+        # Select a dataset to evaluate the models on
         data_subset = gr.Radio(
            value="Common Voice",
            choices=DATASET_OPTIONS,
            label="Data subset / Own Sample",
        )
-        …
-        …
+        # Components used to record an own sample
+        own_audio = gr.Audio(sources=['microphone'], visible=False, label=None)
+        own_transcription = gr.TextArea(lines=2, visible=False, label=None)
+
+        # Event listener to display the correct components
        data_subset.change(is_own, inputs=[data_subset], outputs=[own_audio, own_transcription])
    with gr.Column(scale=1):
        pass
@@ -94,31 +103,37 @@ Happy experimenting and comparing! 🚀""")
 
 with gr.Row():
 
+    # This column is for selecting the first model
     with gr.Column(scale=1):
        model_1 = gr.Dropdown(
            choices=MODEL_OPTIONS,
-            label=
+            label=None
        )
        model_1_card = gr.Markdown("")
 
+    # This column is for selecting the second model
     with gr.Column(scale=1):
        model_2 = gr.Dropdown(
            choices=MODEL_OPTIONS,
-            label=
+            label=None
        )
        model_2_card = gr.Markdown("")
 
-
+    # Event listeners for when a model has been selected
     model_1.change(get_card, inputs=model_1, outputs=model_1_card)
     model_2.change(get_card, inputs=model_2, outputs=model_2_card)
 
-
+    # Main action button to start the evaluation
     eval_btn = gr.Button(
        value="Evaluate",
        variant="primary",
        size="sm")
 
-
+    # This section displays the evaluation results
+    results_title = gr.Markdown(
+        '## <p style="text-align: center;">Results</p>',
+        visible=False
+    )
     results_md = gr.Markdown("")
     results_plot = gr.Plot(show_label=False, visible=False)
     results_df = gr.DataFrame(
@@ -127,6 +142,8 @@ Happy experimenting and comparing! 🚀""")
        interactive=False, # Users cannot edit the DataFrame
        wrap=True, # Ensure text wraps to multiple lines
    )
+
+    # Event listeners for when the main action button has been triggered
     eval_btn.click(make_visible, outputs=[results_plot, results_df, results_title])
     eval_btn.click(run, [data_subset, model_1, model_2, own_audio, own_transcription], [results_md, results_plot, results_df], show_progress=False)
 
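The introduction added above leans entirely on the Word Error Rate as its comparison metric. As a minimal, self-contained sketch (not part of the commit) of how that metric is computed with the `evaluate` library that processing.py already loads; the two example sentences are made up purely for illustration:

import evaluate

# Load the WER metric, the same way processing.py does
wer_metric = evaluate.load("wer")

# Hypothetical reference/prediction pair, only to illustrate the computation
references = ["the cat sat on the mat"]
predictions = ["the cat sat on a mat"]

# WER = (substitutions + insertions + deletions) / number of reference words
wer = wer_metric.compute(references=references, predictions=predictions)
print(round(100 * wer, 2), "% WER (lower is better)")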
cards.txt
CHANGED

@@ -25,11 +25,13 @@
 - Model Paper: [Wav2vec 2.0: Learning the structure of speech from raw audio](https://ai.meta.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
 - Training Data: ?
 @@
 #### Whisper Large v2
 - ID: openai/whisper-large-v2
 - Hugging Face: [model](https://huggingface.co/openai/whisper-large-v2)
 - Creator: openai
 - Finetuned: No
 - Model Size: 1.54 B Parameters
 - Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
 - Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
+
+(evaluating this model might take a while due to its size)
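The `@@` line in the middle of cards.txt looks like the separator between the individual model cards that `get_card` in app.py reads. The diff does not show the body of `get_card`, so the following is only a sketch of how such a lookup could work under that assumption; the split token and the matching rule are guesses, not the Space's actual implementation:

def get_card(selected_model: str) -> str:
    # Assumption: cards.txt holds one markdown card per model, separated by lines containing "@@"
    with open("cards.txt", "r") as f:
        cards = f.read().split("@@")
    for card in cards:
        # Assumption: each card mentions its model ID, e.g. "openai/whisper-large-v2"
        if selected_model in card:
            return card.strip()
    return "Unknown Model"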
model_evaluation.ipynb
ADDED

## Model Evaluation

Hi there, welcome to my notebook! 👋

This notebook is all about evaluating different models using a small subset of a larger dataset.

The notebook is self-contained: except for installing the necessary libraries, you can run all cells in order and everything should work. If not, feel free to leave me a message and I'll do my best to fix the issue.

All you need for this notebook to work is a **HuggingFace token**.

If you don't know how to find it, go to your Hugging Face
> Profile -> Settings -> Access Tokens -> + Create new token

You can find the notebook in Google Colab [here](https://colab.research.google.com/drive/1awfo4_Llrg-aypEc_MdJXcqQMj3r_Fy2?usp=share_link)

### 1. Import all necessary libraries

from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from huggingface_hub import login
from datasets import load_dataset
from datasets import Audio
from tqdm import tqdm
import evaluate
import torch

### 2. Log in & set constants

# Login
login("hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

# Set constants
N_SAMPLES = 100

### 3. Load Dataset & Metric

# Load the dataset
dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True, token=True, trust_remote_code=True)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset = dataset.take(N_SAMPLES)

# Load the evaluation metric
wer_metric = evaluate.load("wer")

# Create a dictionary to store the results
results = {
    "facebook/wav2vec2-base-960h": 0,
    "openai/whisper-tiny.en": 0,
    "facebook/s2t-medium-librispeech-asr": 0
}

### 4. Evaluate the first Model

# Load the 1. ASR model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Run inference for the first model
predictions = []
references = []

for i, item in tqdm(enumerate(dataset), total=N_SAMPLES):
    input_values = processor(item["audio"]["array"], sampling_rate=16000, return_tensors="pt", padding="longest").input_values  # Batch size 1
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    predictions.append(transcription[0])
    references.append(item["text"])

wer = wer_metric.compute(references=references, predictions=predictions)
wer = round(100 * wer, 2)
print("WER:", wer)
results["facebook/wav2vec2-base-960h"] = wer

### 5. Evaluate the second Model

# Load the 2. ASR model
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

# Run inference for the second model
predictions = []
references = []

for i, item in tqdm(enumerate(dataset), total=N_SAMPLES):
    input_features = processor(item["audio"]["array"], sampling_rate=16000, return_tensors="pt", padding="longest").input_features  # Batch size 1
    predicted_ids = model.generate(input_features=input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    predictions.append(processor.tokenizer.normalize(transcription[0]))
    references.append(processor.tokenizer.normalize(item["text"]))

wer = wer_metric.compute(references=references, predictions=predictions)
wer = round(100 * wer, 2)
print("WER:", wer)
results["openai/whisper-tiny.en"] = wer

### 6. Evaluate the third Model

# Load the 3. ASR model
model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr")
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr", do_upper_case=True)

# Run inference for the third model
predictions = []
references = []

for i, item in tqdm(enumerate(dataset), total=N_SAMPLES):
    sample = item["audio"]
    features = processor(sample["array"], sampling_rate=16000, padding=True, return_tensors="pt")
    input_features = features.input_features
    attention_mask = features.attention_mask
    gen_tokens = model.generate(input_features=input_features, attention_mask=attention_mask)
    transcription = processor.batch_decode(gen_tokens, skip_special_tokens=True)
    predictions.append(transcription[0])
    references.append(item["text"])

wer = wer_metric.compute(references=references, predictions=predictions)
wer = round(100 * wer, 2)
print("WER:", wer)
results["facebook/s2t-medium-librispeech-asr"] = wer

### 7. Find the winning Model

winning_model = min(results, key=results.get)
min_wer = results[winning_model]

print(f"The model {winning_model} achieved the lowest WER: {min_wer}")
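The notebook's login cell hardcodes a placeholder token, while app.py in this same commit reads the token from an environment variable. As a small sketch of reusing that pattern in the notebook, assuming the same `HF_Token` variable name the Space uses:

import os
from huggingface_hub import login

# Read the token from the environment instead of pasting it into the cell
hf_token = os.getenv("HF_Token")
if hf_token is None:
    raise ValueError("Set the HF_Token environment variable before running the notebook")
login(hf_token)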
processing.py
CHANGED

@@ -1,9 +1,14 @@
+# Import libraries to load models
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-…
+
+# Import libraries to access datasets
 from datasets import load_dataset
 from datasets import Audio
+
+# Helper libraries
+import plotly.graph_objs as go
 import evaluate
 import librosa
 import torch
@@ -11,26 +16,41 @@ import numpy as np
 import pandas as pd
 import time
 
-…
+# This constant determines how many samples the models are evaluated on
+N_SAMPLES = 50
 
+# Load the WER metric
 wer_metric = evaluate.load("wer")
 
-def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str):
 
-…
+def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str):
+    """
+    Main function running an entire evaluation cycle.
+
+    Params:
+    - data_subset (str)       : the name of a valid dataset, one of ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
+    - model_1 (str)           : the name of a valid model, one of ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h", "openai/whisper-large-v2"]
+    - model_2 (str)           : the name of a valid model, one of ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h", "openai/whisper-large-v2"]
+    - own_audio (gr.Audio)    : the return value of a gr.Audio component, a (sr, audio as numpy array) tuple
+    - own_transcription (str) : the transcription paired with own_audio
+    """
+
+    # A little bit of error handling
+    if data_subset is None and own_audio is None and own_transcription is None:
         raise ValueError("No Dataset selected")
     if model_1 is None:
         raise ValueError("No Model 1 selected")
     if model_2 is None:
         raise ValueError("No Model 2 selected")
 
+    # Load the selected dataset, but only N_SAMPLES of it
     if data_subset == "Common Voice":
         dataset, text_column = load_Common_Voice()
     elif data_subset == "Librispeech ASR clean":
         dataset, text_column = load_Librispeech_ASR_clean()
     elif data_subset == "Librispeech ASR other":
         dataset, text_column = load_Librispeech_ASR_other()
-    elif data_subset == "OWN
+    elif data_subset == "OWN Recording/Sample":
         sr, audio = own_audio
         audio = audio.astype(np.float32)
         print("AUDIO: ", type(audio), audio)
@@ -38,15 +58,16 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
     else:
         # if data_subset is None then still load load_Common_Voice
         dataset, text_column = load_Common_Voice()
-    print("Dataset Loaded")
 
+    # I have left the print statements in because users have access to the logs in Spaces and this might help to understand what's going on
+    print("Dataset Loaded")
+    # Load the selected models
     model1, processor1 = load_model(model_1)
     model2, processor2 = load_model(model_2)
     print("Models Loaded")
 
-…
-…
-    if data_subset == "OWN Recoding/Sample":
+    # If an own recording is selected, only a single sample has to be evaluated
+    if data_subset == "OWN Recording/Sample":
         sample = {"audio":{"array":audio,"sampling_rate":16000}}
         inference_times1 = []
         inference_times2 = []
@@ -98,6 +119,7 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
 
         yield results_md, fig, df
 
+    # If a dataset has been selected
     else:
         references = []
         transcriptions1 = []
@@ -229,7 +251,7 @@ def load_model(model_id:str):
         processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
         model.config.forced_decoder_ids = None
-    else:
+    else: # If no model has been selected, whisper-tiny.en is used - just for completeness
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
 
@@ -269,7 +291,7 @@ def model_compute(model, processor, sample, model_id):
         transcription = processor.tokenizer.normalize(transcription[0])
         print("TRANSCRIPTION Whisper Large v2: ", transcription)
         return transcription
-    else:
+    else: # If no model has been selected, whisper-tiny.en is used - just for completeness
         sample = sample["audio"]
         input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
         predicted_ids = model.generate(input_features)
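The new docstring spells out what `run` expects. As a minimal usage sketch outside of Gradio, assuming a 16 kHz mono recording as a NumPy array (the (sr, array) tuple mirrors what gr.Audio returns, and the silent dummy audio is only a stand-in for a real recording):

import numpy as np
from processing import run

# Dummy 1-second, 16 kHz recording standing in for a real gr.Audio value (sr, array)
own_audio = (16000, np.zeros(16000, dtype=np.float32))
own_transcription = "hello world"

# run() is a generator: it yields (results_md, fig, df) as the evaluation progresses
for results_md, fig, df in run(
    data_subset="OWN Recording/Sample",
    model_1="openai/whisper-tiny.en",
    model_2="facebook/wav2vec2-base-960h",
    own_audio=own_audio,
    own_transcription=own_transcription,
):
    print(results_md)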