j-tobias committed • 15f66cd
Parent(s): 234fe59

latest changes

Files changed:
- app.py +41 -24
- cards.txt +4 -2
- model_evaluation.ipynb +278 -0
- processing.py +33 -11
app.py
CHANGED

@@ -1,6 +1,6 @@
 import gradio as gr
 from processing import run
-import json
+import json # only used if hf_login() is used
 from huggingface_hub import login
 import os
 
@@ -11,6 +11,7 @@ import os
 hf_token = os.getenv("HF_Token")
 login(hf_token)
 
+# I have used this function for logging into HF with a credentials file
 # def hf_login():
 #     hf_token = os.getenv("HF_Token")
 #     if hf_token is None:
@@ -26,10 +27,13 @@ login(hf_token)
 
 # GENERAL OPTIONS FOR MODELS AND DATASETS
 MODEL_OPTIONS = ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h","openai/whisper-large-v2"]
-DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN
+DATASET_OPTIONS = ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
 
 # HELPER FUNCTIONS
 def get_card(selected_model:str)->str:
+    """
+    Retrieves the markdown card displayed for the selected model.
+    """
 
     with open("cards.txt", "r") as f:
         cards = f.read()
@@ -42,37 +46,38 @@ def get_card(selected_model:str)->str:
     return "Unknown Model"
 
 def is_own(selected_option):
-    …
+    """
+    If the user wants to record their own sample, this function makes the corresponding components visible.
+    """
+    if selected_option == "OWN Recording/Sample":
         return gr.update(visible=True), gr.update(visible=True)
     else:
         return gr.update(visible=False), gr.update(visible=False)
 
 def make_visible():
+    """
+    Makes the components needed to display the results visible.
+    """
     return gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
 
 
-
-
-
+# Introduction and information about the app
+INTRODUCTION = """### Welcome to ASR Model Comparison Hub! 🎉
+
+Hey there, and welcome to an app designed just for developers like you, who are passionate about pushing the boundaries of Automatic Speech Recognition (ASR) technology!
+
+Here, you can easily compare different ASR models by selecting a dataset and choosing two models from the dropdown to see how they stack up against each other. If you're feeling creative, go ahead and select 'OWN' as your dataset option to upload your own audio file or record something new right in the app. Don’t forget to provide a transcription, and the app will handle the rest!
+
+ASR Model Comparison Hub uses the Word Error Rate (WER) ⬇️ (the lower the better) metric to give you a clear picture of each model's performance. And hey, don't miss out on checking the **Amazing Leaderboard** where you can see how a wide range of models have been evaluated—[Check it out here](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
+
+Happy experimenting and comparing! 🚀"""
 
 # THE ACTUAL APP
 with gr.Blocks() as demo:
 
 
     gr.Markdown('# <p style="text-align: center;">ASR Model Comparison 💬</p>')
-    gr.Markdown(
-    …
-    """)
-
-
-    gr.Markdown("""### Welcome to ASR Model Comparison Hub! 🎉
-    … (the same introduction text, previously passed inline)
-Happy experimenting and comparing! 🚀""")
+    gr.Markdown(INTRODUCTION)
 
 
 
@@ -80,13 +85,17 @@ Happy experimenting and comparing! 🚀""")
     with gr.Column(scale=1):
         pass
     with gr.Column(scale=5):
+        # Select a dataset to evaluate the models on
         data_subset = gr.Radio(
            value="Common Voice",
            choices=DATASET_OPTIONS,
            label="Data subset / Own Sample",
        )
-        …
-        …
+        # Components used to record an own sample
+        own_audio = gr.Audio(sources=['microphone'], visible=False, label=None)
+        own_transcription = gr.TextArea(lines=2, visible=False, label=None)
+
+        # Event listener to display the correct components
        data_subset.change(is_own, inputs=[data_subset], outputs=[own_audio, own_transcription])
    with gr.Column(scale=1):
        pass
@@ -94,31 +103,37 @@ Happy experimenting and comparing! 🚀""")
 
 with gr.Row():
 
+    # This column is for selecting the first model
     with gr.Column(scale=1):
        model_1 = gr.Dropdown(
            choices=MODEL_OPTIONS,
-            label=
+            label=None
        )
        model_1_card = gr.Markdown("")
 
+    # This column is for selecting the second model
     with gr.Column(scale=1):
        model_2 = gr.Dropdown(
            choices=MODEL_OPTIONS,
-            label=
+            label=None
        )
        model_2_card = gr.Markdown("")
 
-
+    # Event listeners for when a model has been selected
     model_1.change(get_card, inputs=model_1, outputs=model_1_card)
     model_2.change(get_card, inputs=model_2, outputs=model_2_card)
 
-
+    # Main action button to start the evaluation
     eval_btn = gr.Button(
        value="Evaluate",
        variant="primary",
        size="sm")
 
-
+    # This section displays the evaluation results
+    results_title = gr.Markdown(
+        '## <p style="text-align: center;">Results</p>',
+        visible=False
+    )
     results_md = gr.Markdown("")
     results_plot = gr.Plot(show_label=False, visible=False)
     results_df = gr.DataFrame(
@@ -127,6 +142,8 @@ Happy experimenting and comparing! 🚀""")
        interactive=False, # Users cannot edit the DataFrame
        wrap=True, # Ensure text wraps to multiple lines
    )
+
+    # Event listeners for when the main action button has been triggered
     eval_btn.click(make_visible, outputs=[results_plot, results_df, results_title])
     eval_btn.click(run, [data_subset, model_1, model_2, own_audio, own_transcription], [results_md, results_plot, results_df], show_progress=False)
 
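The introduction added above leans entirely on the Word Error Rate as its comparison metric. As a minimal, self-contained sketch (not part of the commit) of how that metric is computed with the `evaluate` library that processing.py already loads; the two example sentences are made up purely for illustration:

import evaluate

# Load the WER metric, the same way processing.py does
wer_metric = evaluate.load("wer")

# Hypothetical reference/prediction pair, only to illustrate the computation
references = ["the cat sat on the mat"]
predictions = ["the cat sat on a mat"]

# WER = (substitutions + insertions + deletions) / number of reference words
wer = wer_metric.compute(references=references, predictions=predictions)
print(round(100 * wer, 2), "% WER (lower is better)")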
cards.txt
CHANGED

@@ -25,11 +25,13 @@
 - Model Paper: [Wav2vec 2.0: Learning the structure of speech from raw audio](https://ai.meta.com/blog/wav2vec-20-learning-the-structure-of-speech-from-raw-audio/)
 - Training Data: ?
 @@
 #### Whisper Large v2
 - ID: openai/whisper-large-v2
 - Hugging Face: [model](https://huggingface.co/openai/whisper-large-v2)
 - Creator: openai
 - Finetuned: No
 - Model Size: 1.54 B Parameters
 - Model Paper: [Robust Speech Recognition via Large-Scale Weak Supervision](https://arxiv.org/abs/2212.04356)
 - Training Data: The models are trained on 680,000 hours of audio and the corresponding transcripts collected from the internet. 65% of this data (or 438,000 hours) represents English-language audio and matched English transcripts, roughly 18% (or 126,000 hours) represents non-English audio and English transcripts, while the final 17% (or 117,000 hours) represents non-English audio and the corresponding transcript. This non-English data represents 98 different languages.
+
+(evaluating this model might take a while due to its size)
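The `@@` line in the middle of cards.txt looks like the separator between the individual model cards that `get_card` in app.py reads. The diff does not show the body of `get_card`, so the following is only a sketch of how such a lookup could work under that assumption; the split token and the matching rule are guesses, not the Space's actual implementation:

def get_card(selected_model: str) -> str:
    # Assumption: cards.txt holds one markdown card per model, separated by lines containing "@@"
    with open("cards.txt", "r") as f:
        cards = f.read().split("@@")
    for card in cards:
        # Assumption: each card mentions its model ID, e.g. "openai/whisper-large-v2"
        if selected_model in card:
            return card.strip()
    return "Unknown Model"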
model_evaluation.ipynb
ADDED

## Model Evaluation

Hi there, welcome to my notebook! 👋

This notebook is all about evaluating different models using a small subset of a larger dataset.

The notebook is self-contained: except for installing the necessary libraries, you can run all cells in order and everything should work. If not, feel free to leave me a message and I'll do my best to fix the issue.

All you need for this notebook to work is a **HuggingFace token**.

If you don't know how to find it, go to your Hugging Face
> Profile -> Settings -> Access Tokens -> + Create new token

You can find the notebook in Google Colab [here](https://colab.research.google.com/drive/1awfo4_Llrg-aypEc_MdJXcqQMj3r_Fy2?usp=share_link)

### 1. Import all necessary libraries

from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
from huggingface_hub import login
from datasets import load_dataset
from datasets import Audio
from tqdm import tqdm
import evaluate
import torch

### 2. Log in & set constants

# Login
login("hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx")

# Set constants
N_SAMPLES = 100

### 3. Load Dataset & Metric

# Load the dataset
dataset = load_dataset("librispeech_asr", "clean", split="test", streaming=True, token=True, trust_remote_code=True)
dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
dataset = dataset.take(N_SAMPLES)

# Load the evaluation metric
wer_metric = evaluate.load("wer")

# Create a dictionary to store the results
results = {
    "facebook/wav2vec2-base-960h": 0,
    "openai/whisper-tiny.en": 0,
    "facebook/s2t-medium-librispeech-asr": 0
}

### 4. Evaluate the first Model

# Load the 1. ASR model
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# Run inference for the first model
predictions = []
references = []

for i, item in tqdm(enumerate(dataset), total=N_SAMPLES):
    input_values = processor(item["audio"]["array"], sampling_rate=16000, return_tensors="pt", padding="longest").input_values  # Batch size 1
    logits = model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)
    predictions.append(transcription[0])
    references.append(item["text"])

wer = wer_metric.compute(references=references, predictions=predictions)
wer = round(100 * wer, 2)
print("WER:", wer)
results["facebook/wav2vec2-base-960h"] = wer

### 5. Evaluate the second Model

# Load the 2. ASR model
processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")

# Run inference for the second model
predictions = []
references = []

for i, item in tqdm(enumerate(dataset), total=N_SAMPLES):
    input_features = processor(item["audio"]["array"], sampling_rate=16000, return_tensors="pt", padding="longest").input_features  # Batch size 1
    predicted_ids = model.generate(input_features=input_features)
    transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
    predictions.append(processor.tokenizer.normalize(transcription[0]))
    references.append(processor.tokenizer.normalize(item["text"]))

wer = wer_metric.compute(references=references, predictions=predictions)
wer = round(100 * wer, 2)
print("WER:", wer)
results["openai/whisper-tiny.en"] = wer

### 6. Evaluate the third Model

# Load the 3. ASR model
model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-librispeech-asr")
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-librispeech-asr", do_upper_case=True)

# Run inference for the third model
predictions = []
references = []

for i, item in tqdm(enumerate(dataset), total=N_SAMPLES):
    sample = item["audio"]
    features = processor(sample["array"], sampling_rate=16000, padding=True, return_tensors="pt")
    input_features = features.input_features
    attention_mask = features.attention_mask
    gen_tokens = model.generate(input_features=input_features, attention_mask=attention_mask)
    transcription = processor.batch_decode(gen_tokens, skip_special_tokens=True)
    predictions.append(transcription[0])
    references.append(item["text"])

wer = wer_metric.compute(references=references, predictions=predictions)
wer = round(100 * wer, 2)
print("WER:", wer)
results["facebook/s2t-medium-librispeech-asr"] = wer

### 7. Find the winning Model

winning_model = min(results, key=results.get)
min_wer = results[winning_model]

print(f"The model {winning_model} achieved the lowest WER: {min_wer}")
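The notebook's login cell hardcodes a placeholder token, while app.py in this same commit reads the token from an environment variable. As a small sketch of reusing that pattern in the notebook, assuming the same `HF_Token` variable name the Space uses:

import os
from huggingface_hub import login

# Read the token from the environment instead of pasting it into the cell
hf_token = os.getenv("HF_Token")
if hf_token is None:
    raise ValueError("Set the HF_Token environment variable before running the notebook")
login(hf_token)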
processing.py
CHANGED

@@ -1,9 +1,14 @@
+# Import libraries to load models
 from transformers import WhisperProcessor, WhisperForConditionalGeneration
 from transformers import Speech2TextForConditionalGeneration, Speech2TextProcessor
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-…
+
+# Import libraries to access datasets
 from datasets import load_dataset
 from datasets import Audio
+
+# Helper libraries
+import plotly.graph_objs as go
 import evaluate
 import librosa
 import torch
@@ -11,26 +16,41 @@ import numpy as np
 import pandas as pd
 import time
 
-…
+# This constant determines how many samples the models are evaluated on
+N_SAMPLES = 50
 
+# Load the WER metric
 wer_metric = evaluate.load("wer")
 
-def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str):
 
-…
+def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:str):
+    """
+    Main function running an entire evaluation cycle.
+
+    Params:
+    - data_subset (str)       : the name of a valid dataset, one of ["Common Voice", "Librispeech ASR clean", "Librispeech ASR other", "OWN Recording/Sample"]
+    - model_1 (str)           : the name of a valid model, one of ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h", "openai/whisper-large-v2"]
+    - model_2 (str)           : the name of a valid model, one of ["openai/whisper-tiny.en", "facebook/s2t-medium-librispeech-asr", "facebook/wav2vec2-base-960h", "openai/whisper-large-v2"]
+    - own_audio (gr.Audio)    : the return value of a gr.Audio component, a (sr, audio as numpy array) tuple
+    - own_transcription (str) : the transcription paired with own_audio
+    """
+
+    # A little bit of error handling
+    if data_subset is None and own_audio is None and own_transcription is None:
         raise ValueError("No Dataset selected")
     if model_1 is None:
         raise ValueError("No Model 1 selected")
     if model_2 is None:
         raise ValueError("No Model 2 selected")
 
+    # Load the selected dataset, but only N_SAMPLES of it
     if data_subset == "Common Voice":
         dataset, text_column = load_Common_Voice()
     elif data_subset == "Librispeech ASR clean":
         dataset, text_column = load_Librispeech_ASR_clean()
     elif data_subset == "Librispeech ASR other":
         dataset, text_column = load_Librispeech_ASR_other()
-    elif data_subset == "OWN
+    elif data_subset == "OWN Recording/Sample":
         sr, audio = own_audio
         audio = audio.astype(np.float32)
         print("AUDIO: ", type(audio), audio)
@@ -38,15 +58,16 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
     else:
         # if data_subset is None then still load load_Common_Voice
         dataset, text_column = load_Common_Voice()
-    print("Dataset Loaded")
 
+    # I have left the print statements in because users have access to the logs in Spaces and this might help to understand what's going on
+    print("Dataset Loaded")
+    # Load the selected models
     model1, processor1 = load_model(model_1)
     model2, processor2 = load_model(model_2)
     print("Models Loaded")
 
-…
-…
-    if data_subset == "OWN Recoding/Sample":
+    # If an own recording is selected, only a single sample has to be evaluated
+    if data_subset == "OWN Recording/Sample":
         sample = {"audio":{"array":audio,"sampling_rate":16000}}
         inference_times1 = []
         inference_times2 = []
@@ -98,6 +119,7 @@ def run(data_subset:str, model_1:str, model_2:str, own_audio, own_transcription:
 
         yield results_md, fig, df
 
+    # If a dataset has been selected
     else:
         references = []
         transcriptions1 = []
@@ -229,7 +251,7 @@ def load_model(model_id:str):
         processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
         model.config.forced_decoder_ids = None
-    else:
+    else: # If no model has been selected, whisper-tiny.en is used - just for completeness
         model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
         processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
 
@@ -269,7 +291,7 @@ def model_compute(model, processor, sample, model_id):
         transcription = processor.tokenizer.normalize(transcription[0])
         print("TRANSCRIPTION Whisper Large v2: ", transcription)
         return transcription
-    else:
+    else: # If no model has been selected, whisper-tiny.en is used - just for completeness
         sample = sample["audio"]
         input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
         predicted_ids = model.generate(input_features)
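The new docstring spells out what `run` expects. As a minimal usage sketch outside of Gradio, assuming a 16 kHz mono recording as a NumPy array (the (sr, array) tuple mirrors what gr.Audio returns, and the silent dummy audio is only a stand-in for a real recording):

import numpy as np
from processing import run

# Dummy 1-second, 16 kHz recording standing in for a real gr.Audio value (sr, array)
own_audio = (16000, np.zeros(16000, dtype=np.float32))
own_transcription = "hello world"

# run() is a generator: it yields (results_md, fig, df) as the evaluation progresses
for results_md, fig, df in run(
    data_subset="OWN Recording/Sample",
    model_1="openai/whisper-tiny.en",
    model_2="facebook/wav2vec2-base-960h",
    own_audio=own_audio,
    own_transcription=own_transcription,
):
    print(results_md)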