wasertech's picture
tidy up
60ea0fc
raw
history blame
8.94 kB
import gradio as gr
import pandas as pd
import json
from constants import BANNER, INTRODUCTION_TEXT, CITATION_TEXT, METRICS_TAB_TEXT, DIR_OUTPUT_REQUESTS
from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub
from utils_display import AutoEvalColumn, fields, make_clickable_model, styled_error, styled_message
from datetime import datetime, timezone
# Date interpolated into the "Last updated on" footer of the page.
LAST_UPDATED = "Sep 9th 2023"

# Rename mapping passed to ``DataFrame.rename`` on the leaderboard table:
# raw column key -> header shown to the user.
column_names = {
    "MODEL": "Model",
    "Avg. WER": "Average WER ⬇️",
    "RTF": "RTF (1e-3) ⬇️",
    "Common Voice WER": "Common Voice WER ⬇️",
    "D_AVG_CV_WER": "Delta AVG-CV WER",
}
# Skipping the tests and just using the numbers computed in the original space, for my sanity's sake
# eval_queue_repo, requested_models, csv_results = load_all_info_from_dataset_hub()
# if not csv_results.exists():
# raise Exception(f"CSV file {csv_results} does not exist locally")
# # Get csv with data and parse columns
# original_df = pd.read_csv(csv_results)
# Hard-coded leaderboard rows, one model per row, in the order given by
# ``columns`` below: model id, RTF, average WER, Common Voice WER.
# Rows are listed by ascending average WER.
data = [
    ["nvidia/stt_en_fastconformer_transducer_xlarge", 12.3, 8.06, 7.26],
    ["nvidia/stt_en_fastconformer_transducer_xxlarge", 14.4, 8.07, 6.07],
    ["openai/whisper-large-v2", 12.7, 8.16, 10.12],
    ["nvidia/stt_en_fastconformer_ctc_xxlarge", 5, 8.34, 8.31],
    ["nvidia/stt_en_conformer_ctc_large", 7.5, 8.39, 9.1],
    ["openai/whisper-medium.en", 10.7, 8.5, 11.96],
    ["nvidia/stt_en_fastconformer_ctc_xlarge", 2.9, 8.52, 7.51],
    ["nvidia/stt_en_fastconformer_ctc_large", 1.8, 8.9, 8.56],
    ["nvidia/stt_en_fastconformer_transducer_large", 10.4, 8.94, 8.04],
    ["openai/whisper-large", 12.7, 9.2, 10.92],
    ["nvidia/stt_en_conformer_transducer_large", 21.8, 9.27, 7.36],
    ["openai/whisper-small.en", 8.3, 9.34, 15.13],
    ["nvidia/stt_en_conformer_transducer_small", 17.7, 10.81, 14.35],
    ["openai/whisper-base.en", 7.2, 11.67, 21.77],
    ["nvidia/stt_en_conformer_ctc_small", 3.2, 11.77, 16.59],
    ["patrickvonplaten/wav2vec2-large-960h-lv60-self-4-gram", 20.1, 13.65, 20.05],
    ["facebook/wav2vec2-large-960h-lv60-self", 2.5, 14.47, 22.15],
    ["openai/whisper-tiny.en", 9.1, 14.96, 31.09],
    ["patrickvonplaten/hubert-xlarge-ls960-ft-4-gram", 24.5, 15.11, 19.16],
    ["speechbrain/asr-wav2vec2-librispeech", 2.6, 15.61, 23.71],
    ["facebook/hubert-xlarge-ls960-ft", 6.3, 15.81, 22.05],
    ["facebook/mms-1b-all", 5.9, 15.85, 21.23],
    ["facebook/hubert-large-ls960-ft", 2.6, 15.93, 23.12],
    ["facebook/wav2vec2-large-robust-ft-libri-960h", 2.7, 16.07, 22.57],
    ["facebook/wav2vec2-conformer-rel-pos-large-960h-ft", 5.2, 17, 23.01],
    ["facebook/wav2vec2-conformer-rope-large-960h-ft", 7.8, 17.06, 23.08],
    ["facebook/wav2vec2-large-960h", 1.8, 21.76, 34.01],
    ["facebook/wav2vec2-base-960h", 1.2, 26.41, 41.75],
]

# Display headers, positionally matching the values in each ``data`` row.
columns = [
    "Model",
    "RTF (1e-3) ⬇️",
    "Average WER ⬇️",
    "Common Voice WER ⬇️",
]

# Table that backs the leaderboard view.
original_df = pd.DataFrame(data, columns=columns)
# Formats the columns
def formatter(x):
    """Return ``x`` rounded to two decimal places.

    ``x`` must support ``round(x, 2)``; the leaderboard applies this to its
    numeric cells.
    """
    return round(x, 2)
# Post-process the leaderboard table: model ids become clickable links and
# every other (numeric) column is rounded by ``formatter``.
for col in original_df.columns:
    # The column header is "Model" (capitalised). Comparing against "model"
    # never matched, which sent the model-id strings to ``formatter`` and
    # made ``round()`` raise TypeError.
    if col == "Model":
        original_df[col] = original_df[col].apply(make_clickable_model)
    else:
        original_df[col] = original_df[col].apply(formatter)  # For numerical values

# Map any raw column keys to their display headers. Keys absent from the
# frame are ignored by ``rename``.
original_df.rename(columns=column_names, inplace=True)

# Order rows by Common Voice WER, ascending. The full header is required here:
# sorting by the non-existent 'Common Voice' column raised KeyError.
original_df.sort_values(by="Common Voice WER ⬇️", inplace=True)

# Compute delta between average WER and CV WER
original_df["Delta Avg. C.V. WER"] = (
    original_df["Average WER ⬇️"] - original_df["Common Voice WER ⬇️"]
).apply(formatter)

# Column names and datatypes declared on AutoEvalColumn; TYPES is passed as
# the ``datatype`` of the leaderboard Dataframe component.
COLS = [c.name for c in fields(AutoEvalColumn)]
TYPES = [c.type for c in fields(AutoEvalColumn)]
# The only assignment to ``requested_models`` in this file is in the
# commented-out ``load_all_info_from_dataset_hub()`` call, so the name would be
# undefined when ``request_model`` runs. Fall back to an in-memory list unless
# the module has already defined it.
if "requested_models" not in globals():
    requested_models = []


def request_model(model_text, chbcoco2017):
    """Register an evaluation request for ``model_text``.

    The request is serialised as JSON into a temporary file under
    ``DIR_OUTPUT_REQUESTS``, handed to ``upload_file`` and then removed
    locally. Requests already present in ``requested_models`` are rejected.

    Args:
        model_text: Model identifier typed by the user
            (``user_name/model_name``).
        chbcoco2017: Checkbox state; when truthy the
            "ESB Datasets tests only" dataset is selected.

    Returns:
        The result of ``styled_error(...)`` when validation or submission
        fails, otherwise the result of ``styled_message(...)``. The caller
        renders it in a Markdown component.
    """
    # Determine the selected checkboxes
    dataset_selection = []
    if chbcoco2017:
        dataset_selection.append("ESB Datasets tests only")

    if not dataset_selection:
        return styled_error("You need to select at least one dataset")

    base_model_on_hub, error_msg = is_model_on_hub(model_text)
    if not base_model_on_hub:
        return styled_error(f"Base model '{model_text}' {error_msg}")

    # Construct the output dictionary
    current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    required_datasets = ', '.join(dataset_selection)
    eval_entry = {
        "date": current_time,
        "model": model_text,
        "datasets_selected": required_datasets
    }

    # Prepare file path
    DIR_OUTPUT_REQUESTS.mkdir(parents=True, exist_ok=True)

    fn_datasets = '@ '.join(dataset_selection)
    filename = model_text.replace("/", "@") + "@@" + fn_datasets

    if filename in requested_models:
        return styled_error(f"A request for this model '{model_text}' and dataset(s) was already made.")

    try:
        filename_ext = filename + ".txt"
        out_filepath = DIR_OUTPUT_REQUESTS / filename_ext

        # Write the results to a text file
        with open(out_filepath, "w", encoding="utf-8") as f:
            f.write(json.dumps(eval_entry))

        try:
            upload_file(filename, out_filepath)

            # Include file in the list of uploaded files
            requested_models.append(filename)
        finally:
            # Remove the local file even when the upload fails, so failed
            # attempts do not pile up in DIR_OUTPUT_REQUESTS.
            out_filepath.unlink()

        return styled_message("🤗 Your request has been submitted and will be evaluated soon!</p>")
    except Exception as e:
        # UI boundary: keep the user-facing message generic, but leave a
        # trace in the process output instead of discarding the cause.
        print(f"Error submitting request for '{model_text}': {e!r}")
        return styled_error("Error submitting request!")
# Legend rendered as Markdown right below the introduction text.
# NOTE(review): the text is reproduced verbatim, spelling included; proofread
# it separately if the wording is to be changed.
CUSTOM_MESSAGE = """Legend:
This space is a fork of the original [hf-audio/open_asr_leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard).
It aims to show how CommonVoice Test Set is large enough for most languages to give a relativly good approximation of the average WER/CER but at a much lower computational cost.
#### Why is this useful?
Because, it gives us a standardized test set for most languages allowing us to programatically choose a relatively good model for any CV supported languages.
`Model`, `RTF (1e-3) ⬇️` and`Average WER ⬇️` were reported from [hf-audio/open_asr_leaderboard](https://huggingface.co/spaces/hf-audio/open_asr_leaderboard) the 9 using version from the 7 Sept. 2023.
### Results
CommonVoice Test test give a word error rate (WER) within less than 20 points of the average WER.
It's not good. Don't use only CommonVoice to choose the most adequate architecture.
But to quickly find a suitable ASR model for a large panel of lanugages in a programatic fashion, it's close enough."""

# Page layout: banner, introduction, legend, three tabs (leaderboard, metrics,
# request form), a "last updated" footer and a collapsible citation box.
# NOTE(review): the source had lost its indentation; the nesting of the
# layout containers below is reconstructed - confirm against the rendered page.
with gr.Blocks() as demo:
    gr.HTML(BANNER, elem_id="banner")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    gr.Markdown(CUSTOM_MESSAGE, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # Tab 0: read-only results table.
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            leaderboard_table = gr.components.Dataframe(
                value=original_df,
                datatype=TYPES,
                max_rows=None,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )

        # Tab 1: static description of the metrics.
        with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

        # Tab 2: form that forwards a model name to ``request_model``.
        with gr.TabItem("✉️✨ Request a model here!", elem_id="od-benchmark-tab-table", id=2):
            with gr.Column():
                gr.Markdown("# ✉️✨ Request results for a new model here!", elem_classes="markdown-text")
            with gr.Column():
                gr.Markdown("Select a dataset:", elem_classes="markdown-text")
                with gr.Column():
                    model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
                    # Hidden, non-interactive and pre-checked, so its value
                    # passed to request_model is always True.
                    chb_coco2017 = gr.Checkbox(label="COCO validation 2017 dataset", visible=False, value=True, interactive=False)
                with gr.Column():
                    mdw_submission_result = gr.Markdown()
                    btn_submitt = gr.Button(value="🚀 Request")
                    btn_submitt.click(
                        fn=request_model,
                        inputs=[model_name_textbox, chb_coco2017],
                        outputs=mdw_submission_result,
                    )

    gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_TEXT,
                lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
            ).style(show_copy_button=True)

# Start the Gradio server when the module is executed.
demo.launch()