|
""" |
|
Main module for the WhisperKit Evaluation Dashboard. |
|
This module sets up and runs the Gradio interface for the WhisperKit Evaluation Dashboard, |
|
allowing users to explore and compare speech recognition model performance across different |
|
devices, operating systems, and datasets. |
|
""" |
|
|
|
import json |
|
import os |
|
import re |
|
from math import ceil, floor |
|
|
|
import gradio as gr |
|
import pandas as pd |
|
from argmax_gradio_components import RangeSlider |
|
from dotenv import load_dotenv |
|
from huggingface_hub import login |
|
|
|
|
|
from constants import ( |
|
BANNER_TEXT, |
|
CITATION_BUTTON_LABEL, |
|
CITATION_BUTTON_TEXT, |
|
COL_NAMES, |
|
HEADER, |
|
METHODOLOGY_TEXT, |
|
PERFORMANCE_TEXT, |
|
) |
|
from utils import ( |
|
add_datasets_to_performance_columns, |
|
calculate_quality_parity, |
|
create_initial_performance_column_dict, |
|
css, |
|
fields, |
|
get_os_name_and_version, |
|
make_model_name_clickable_link, |
|
plot_metric, |
|
read_json_line_by_line, |
|
) |
|
|
|
|
|
# Load variables from a local .env file (if present) into the environment.
load_dotenv()

# Hugging Face access token, taken from the environment.
HF_TOKEN = os.getenv("HF_TOKEN")

# Authenticate only when a token is configured: calling ``login`` with
# ``token=None`` falls back to an interactive prompt, which cannot be
# answered when the dashboard is started unattended.
if HF_TOKEN:
    login(token=HF_TOKEN, add_to_git_credential=True)

# Dataset location constants (not referenced by the code visible in this file
# section; kept because other code may rely on them).
repo_id = "argmaxinc/whisperkit-evals-dataset"
directory = "xcresults/benchmark_results"
local_dir = ""

# Benchmark records, read with the project's line-by-line JSON reader.
PERFORMANCE_DATA = read_json_line_by_line("dashboard_data/performance_data.json")

# Release metadata; the "releases" and "versions" lists are read from it below.
# JSON is UTF-8 by specification, so do not depend on the platform default.
with open("dashboard_data/version.json", "r", encoding="utf-8") as file:
    VERSION_DATA = json.load(file)

# Quality records, later passed to ``calculate_quality_parity``.
QUALITY_DATA = read_json_line_by_line("dashboard_data/quality_data.json")

# Map each release commit hash to its version string by pairing the two
# lists positionally.
SHA_TO_VERSION = dict(zip(VERSION_DATA["releases"], VERSION_DATA["versions"]))
|
|
|
|
|
# Flatten the benchmark records into a DataFrame; nested keys become dotted
# column names (e.g. "dataset_speed.<dataset>").
benchmark_df = pd.json_normalize(PERFORMANCE_DATA)
releases = VERSION_DATA["releases"]

# Parse timestamps, then drop any timezone so the column is tz-naive.
benchmark_df["timestamp"] = pd.to_datetime(
    benchmark_df["timestamp"]
).dt.tz_localize(None)

# "english_wer" is a copy of "average_wer".
benchmark_df["english_wer"] = benchmark_df["average_wer"]

# Order rows by model-name length, then model/device/os, newest run first, and
# keep only the newest run for each (model, device, os) combination.
# NOTE(review): de-duplication ignores "commit_hash" although rows are later
# selected per release -- confirm that one run across all releases is intended.
sorted_performance_df = (
    benchmark_df.assign(model_len=lambda frame: frame["model"].str.len())
    .sort_values(
        by=["model_len", "model", "device", "os", "timestamp"],
        ascending=[True, True, True, True, False],
    )
    .drop(columns="model_len")
    .drop_duplicates(subset=["model", "device", "os"], keep="first")
    .reset_index(drop=True)
)

# Per-dataset metric columns produced by the flattening step.
dataset_speed_columns = [
    name
    for name in sorted_performance_df.columns
    if name.startswith("dataset_speed.")
]
dataset_toks_columns = [
    name
    for name in sorted_performance_df.columns
    if name.startswith("dataset_tokens_per_second.")
]

# Dataset names are the last dotted component of the speed column names.
PERFORMANCE_DATASETS = [name.split(".")[-1] for name in dataset_speed_columns]

# Columns shown on the leaderboard: fixed columns plus the per-dataset ones.
_base_performance_columns = [
    "model",
    "device",
    "os",
    "english_wer",
    "qoi",
    "speed",
    "tokens_per_second",
    "timestamp",
    "commit_hash",
]
performance_df = sorted_performance_df[
    _base_performance_columns + dataset_speed_columns + dataset_toks_columns
].copy()

# Quality parity is computed row by row against the quality records.
performance_df["parity"] = performance_df.apply(
    lambda row: calculate_quality_parity(QUALITY_DATA, row), axis=1
)

# Replace raw column names with display names where COL_NAMES defines one.
performance_df = performance_df.rename(
    columns=lambda name: COL_NAMES[name] if name in COL_NAMES else name
)

# Give the per-dataset columns their display names: "librispeech-10mins" is
# labelled Short-Form, every other dataset Long-Form.
_dataset_column_labels = {}
for _metric_label, _metric_columns in (
    ("Speed", dataset_speed_columns),
    ("Tok/s", dataset_toks_columns),
):
    for col in _metric_columns:
        dataset_name = col.split(".")[-1]
        _form = "Short-Form" if dataset_name == "librispeech-10mins" else "Long-Form"
        _dataset_column_labels[col] = f"{_form} {_metric_label}"
performance_df = performance_df.rename(columns=_dataset_column_labels)

# Keep the plain model name, then turn the displayed name into a link.
performance_df["model_raw"] = performance_df["Model"].copy()
performance_df["Model"] = performance_df["Model"].apply(
    make_model_name_clickable_link
)
|
|
|
|
|
# Rows belonging to the last entry of the release list; they seed the initial
# device and OS filter choices.
initial_release_df = benchmark_df[benchmark_df["commit_hash"] == releases[-1]]
PERFORMANCE_DEVICES = initial_release_df["device"].unique().tolist()
PERFORMANCE_OS = sorted(
    initial_release_df["os"].apply(get_os_name_and_version).unique().tolist()
)

# Column metadata: start from the base column dictionary and extend it with
# the per-dataset columns.
initial_performance_column_dict = create_initial_performance_column_dict()
performance_column_info = add_datasets_to_performance_columns(
    initial_performance_column_dict, PERFORMANCE_DATASETS
)

# Unpack the pieces of the column metadata used by the UI below.
updated_performance_column_dict = performance_column_info["column_dict"]
PerformanceAutoEvalColumn = performance_column_info["AutoEvalColumn"]
PERFORMANCE_COLS = performance_column_info["COLS"]
PERFORMANCE_TYPES = performance_column_info["TYPES"]
PERFORMANCE_ALWAYS_HERE_COLS = performance_column_info["ALWAYS_HERE_COLS"]
PERFORMANCE_TOGGLE_COLS = performance_column_info["TOGGLE_COLS"]
PERFORMANCE_SELECTED_COLS = performance_column_info["SELECTED_COLS"]
|
|
|
|
|
def get_release_devices(release):
    """
    Return the distinct devices that have benchmark rows for a release.

    :param release: Selected release hash
    :return: List of devices available in the release
    """
    belongs_to_release = benchmark_df["commit_hash"] == release
    devices_in_release = benchmark_df.loc[belongs_to_release, "device"]
    return devices_in_release.unique().tolist()
|
|
|
|
|
def performance_filter(
    df,
    columns,
    model_query,
    exclude_models,
    devices,
    os,
    short_speed_slider,
    long_speed_slider,
    short_toks_slider,
    long_toks_slider,
    release,
):
    """
    Filters the performance DataFrame based on specified criteria.

    The input frame is not modified.

    :param df: The DataFrame to be filtered.
    :param columns: The columns to be included in the filtered DataFrame.
    :param model_query: ';'-separated patterns; rows whose 'Model' matches any are kept.
    :param exclude_models: ';'-separated patterns; rows whose 'Model' matches any are dropped.
    :param devices: The devices to filter the 'Device' column. Empty selection yields no rows.
    :param os: The list of operating systems to filter the 'OS' column. Empty selection yields no rows.
    :param short_speed_slider: (min, max) range for the 'Short-Form Speed' column.
    :param long_speed_slider: (min, max) range for the 'Long-Form Speed' column.
    :param short_toks_slider: (min, max) range for the 'Short-Form Tok/s' column.
    :param long_toks_slider: (min, max) range for the 'Long-Form Tok/s' column.
    :param release: Commit hash of the release whose rows are shown.
    :return: The filtered DataFrame.
    """
    # NOTE: the ``os`` parameter shadows the ``os`` module inside this function;
    # the name is kept because it is part of the existing signature.

    def _alternation(terms):
        # Join the non-empty, stripped terms into one regex alternation. Empty
        # terms (e.g. from a trailing ';') must be dropped: an empty
        # alternative matches every row.
        cleaned = (term.strip() for term in terms)
        return "|".join(term for term in cleaned if term)

    filtered_df = df[df["commit_hash"] == release]

    # Always-present columns first, then the selected optional ones in the
    # canonical column order.
    filtered_df = filtered_df[
        PERFORMANCE_ALWAYS_HERE_COLS
        + [c for c in PERFORMANCE_COLS if c in df.columns and c in columns]
    ]

    if model_query:
        include_pattern = _alternation(model_query.split(";"))
        if include_pattern:
            filtered_df = filtered_df[
                filtered_df["Model"].str.contains(
                    include_pattern, case=False, na=False
                )
            ]

    if exclude_models:
        exclude_pattern = _alternation(exclude_models.split(";"))
        if exclude_pattern:
            filtered_df = filtered_df[
                ~filtered_df["Model"].str.contains(
                    exclude_pattern, case=False, na=False
                )
            ]

    if devices:
        filtered_df = filtered_df[filtered_df["Device"].isin(devices)]
    else:
        filtered_df = pd.DataFrame(columns=filtered_df.columns)

    os_pattern = _alternation(os) if os else ""
    if os_pattern:
        filtered_df = filtered_df[
            filtered_df["OS"].str.contains(os_pattern, case=False, na=False)
        ]
    else:
        filtered_df = pd.DataFrame(columns=filtered_df.columns)

    # Apply the range sliders. Values are coerced to numbers on the frame that
    # is actually being filtered (and only for columns it contains); cells
    # that cannot be parsed become NaN and fall outside every range.
    slider_ranges = (
        ("Short-Form Speed", short_speed_slider),
        ("Long-Form Speed", long_speed_slider),
        ("Short-Form Tok/s", short_toks_slider),
        ("Long-Form Tok/s", long_toks_slider),
    )
    for column_name, (lower, upper) in slider_ranges:
        if column_name in filtered_df.columns:
            numeric_values = pd.to_numeric(filtered_df[column_name], errors="coerce")
            filtered_df = filtered_df[numeric_values.between(lower, upper)]

    return filtered_df
|
|
|
|
|
def update_performance_filters(release):
    """
    Recompute the device and OS filter choices for the selected release.

    Both filters are reset so that every available choice is selected.

    :param release: Selected release hash
    :return: Tuple containing updated device and OS choices
    """
    rows_in_release = benchmark_df[benchmark_df["commit_hash"] == release]

    device_choices = rows_in_release["device"].unique().tolist()
    os_choices = sorted(
        rows_in_release["os"].apply(get_os_name_and_version).unique().tolist()
    )

    device_update = gr.update(choices=device_choices, value=device_choices)
    os_update = gr.update(choices=os_choices, value=os_choices)
    return device_update, os_update
|
|
|
|
|
def update_support_table(release):
    """
    Rebuild the device-support table and its column selector for a release.

    Reads ``dashboard_data/support_data_<first 7 characters of release>.csv``,
    uses the first CSV column as index, turns model names into links and
    orders rows by the length of the resulting "Model" value.

    :param release: Selected release hash
    :return: Tuple of three updates: (table value with every column typed as
        html, column choices with all of them selected, table value again)
    """
    table = pd.read_csv(f"dashboard_data/support_data_{release[:7]}.csv")
    table = table.set_index(table.columns[0])

    def _as_link(raw_name):
        # Underscores in the stored name become slashes before linking.
        return make_model_name_clickable_link(raw_name.replace("_", "/"))

    table["Model"] = table["Model"].apply(_as_link)

    # Order by the length of the (already linked) model value.
    with_length = table.assign(model_len=table["Model"].str.len())
    table = with_length.sort_values(by=["model_len"], ascending=[True]).drop(
        columns=["model_len"]
    )

    # Every column after the first is toggleable.
    toggleable_columns = table.columns.tolist()[1:]
    html_datatypes = ["html"] * len(table.columns)

    return (
        gr.update(value=table, datatype=html_datatypes),
        gr.update(choices=toggleable_columns, value=toggleable_columns),
        gr.update(value=table),
    )
|
|
|
|
|
# Containers created before the Blocks layout is built.
# NOTE(review): diff_tab and text_diff_elems are not used in the code visible
# here -- presumably they are rendered/filled further down; confirm.
diff_tab = gr.TabItem("Difference Checker", elem_id="diff_checker", id=2)
text_diff_elems = []

# Top-level tab container; rendered inside the Blocks layout via tabs.render().
tabs = gr.Tabs(elem_id="tab-elems")

# Font stack handed to the Gradio theme, in order of preference.
font = [
    "Zwizz Regular", "IBM Plex Mono",
    "ui-sans-serif", "system-ui", "sans-serif",
]
|
|
|
|
|
|
|
|
|
|
|
|
|
with gr.Blocks(css=css, theme=gr.themes.Base(font=font)) as demo: |
|
|
|
gr.HTML(HEADER) |
|
gr.HTML(BANNER_TEXT, elem_classes="markdown-text") |
|
gr.Markdown("### Release") |
|
release_dropdown = gr.Dropdown( |
|
choices=[ |
|
(f"{release} v{SHA_TO_VERSION[release]}", release) for release in releases |
|
], |
|
label="Select Release", |
|
value=releases[-1] if releases else None, |
|
elem_id="release-dropdown", |
|
container=False, |
|
) |
|
|
|
|
|
with tabs.render(): |
|
|
|
with gr.TabItem("Benchmark", elem_id="benchmark", id=0): |
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
with gr.Row(): |
|
with gr.Column(scale=6, elem_classes="filter_models_column"): |
|
filter_performance_models = gr.Textbox( |
|
placeholder="π Filter Model (separate multiple queries with ';')", |
|
label="Filter Models", |
|
) |
|
with gr.Column(scale=4, elem_classes="exclude_models_column"): |
|
exclude_performance_models = gr.Textbox( |
|
placeholder="π Exclude Model", |
|
label="Exclude Model", |
|
) |
|
with gr.Row(): |
|
with gr.Accordion("See All Columns", open=False): |
|
with gr.Row(): |
|
with gr.Column(scale=9, elem_id="performance_columns"): |
|
performance_shown_columns = gr.CheckboxGroup( |
|
choices=PERFORMANCE_TOGGLE_COLS, |
|
value=PERFORMANCE_SELECTED_COLS, |
|
label="Toggle Columns", |
|
elem_id="column-select", |
|
interactive=True, |
|
) |
|
with gr.Column( |
|
scale=1, |
|
min_width=200, |
|
elem_id="performance_select_columns", |
|
): |
|
with gr.Row(): |
|
select_all_button = gr.Button( |
|
"Select All", |
|
elem_id="select-all-button", |
|
interactive=True, |
|
) |
|
deselect_all_button = gr.Button( |
|
"Deselect All", |
|
elem_id="deselect-all-button", |
|
interactive=True, |
|
) |
|
|
|
def select_all_columns():
    """Return every toggleable performance column (``PERFORMANCE_TOGGLE_COLS``)."""
    return PERFORMANCE_TOGGLE_COLS
|
|
|
def deselect_all_columns():
    """Return an empty list, i.e. no optional column selected."""
    return []
|
|
|
select_all_button.click( |
|
select_all_columns, |
|
inputs=[], |
|
outputs=performance_shown_columns, |
|
) |
|
deselect_all_button.click( |
|
deselect_all_columns, |
|
inputs=[], |
|
outputs=performance_shown_columns, |
|
) |
|
|
|
with gr.Row(): |
|
with gr.Accordion("Filter Devices", open=False): |
|
with gr.Row(): |
|
with gr.Column( |
|
scale=9, elem_id="filter_devices_column" |
|
): |
|
performance_shown_devices = gr.CheckboxGroup( |
|
choices=get_release_devices(releases[-1]), |
|
value=get_release_devices(releases[-1]), |
|
label="Filter Devices", |
|
interactive=True, |
|
) |
|
with gr.Column( |
|
scale=1, |
|
min_width=200, |
|
elem_id="filter_select_devices", |
|
): |
|
with gr.Row(): |
|
select_all_devices_button = gr.Button( |
|
"Select All", |
|
elem_id="select-all-devices-button", |
|
interactive=True, |
|
) |
|
deselect_all_devices_button = gr.Button( |
|
"Deselect All", |
|
elem_id="deselect-all-devices-button", |
|
interactive=True, |
|
) |
|
|
|
def select_all_devices(release):
    """
    Returns all devices available in the given release.

    :param release: Selected release hash
    :return: List of devices, as produced by ``get_release_devices``
    """
    return get_release_devices(release)
|
|
|
def deselect_all_devices():
    """Returns an empty list for deselecting all devices."""
    return []
|
|
|
select_all_devices_button.click( |
|
select_all_devices, |
|
inputs=[release_dropdown], |
|
outputs=performance_shown_devices, |
|
) |
|
deselect_all_devices_button.click( |
|
deselect_all_devices, |
|
inputs=[], |
|
outputs=performance_shown_devices, |
|
) |
|
with gr.Row(): |
|
performance_shown_os = gr.CheckboxGroup( |
|
choices=PERFORMANCE_OS, |
|
value=PERFORMANCE_OS, |
|
label="Filter OS", |
|
interactive=True, |
|
) |
|
with gr.Column(scale=1): |
|
with gr.Accordion("See Performance Filters"): |
|
with gr.Row(): |
|
with gr.Row(): |
|
min_short_speed, max_short_speed = floor( |
|
min(performance_df["Short-Form Speed"]) |
|
), ceil(max(performance_df["Short-Form Speed"])) |
|
short_speed_slider = RangeSlider( |
|
value=[min_short_speed, max_short_speed], |
|
minimum=min_short_speed, |
|
maximum=max_short_speed, |
|
step=0.001, |
|
label="Short-Form Speed", |
|
) |
|
with gr.Row(): |
|
min_long_speed, max_long_speed = floor( |
|
min(performance_df["Long-Form Speed"]) |
|
), ceil(max(performance_df["Long-Form Speed"])) |
|
long_speed_slider = RangeSlider( |
|
value=[min_long_speed, max_long_speed], |
|
minimum=min_long_speed, |
|
maximum=max_long_speed, |
|
step=0.001, |
|
label="Long-Form Speed", |
|
) |
|
with gr.Row(): |
|
with gr.Row(): |
|
min_short_toks, max_short_toks = floor( |
|
min(performance_df["Short-Form Tok/s"]) |
|
), ceil(max(performance_df["Short-Form Tok/s"])) |
|
short_toks_slider = RangeSlider( |
|
value=[min_short_toks, max_short_toks], |
|
minimum=min_short_toks, |
|
maximum=max_short_toks, |
|
step=0.001, |
|
label="Short-Form Tok/s", |
|
) |
|
with gr.Row(): |
|
min_long_toks, max_long_toks = floor( |
|
min(performance_df["Long-Form Tok/s"]) |
|
), ceil(max(performance_df["Long-Form Tok/s"])) |
|
long_toks_slider = RangeSlider( |
|
value=[min_long_toks, max_long_toks], |
|
minimum=min_long_toks, |
|
maximum=max_long_toks, |
|
step=0.001, |
|
label="Long-Form Tok/s", |
|
) |
|
with gr.Row(): |
|
gr.Markdown(PERFORMANCE_TEXT, elem_classes="markdown-text") |
|
with gr.Row(): |
|
initial_df = performance_df[ |
|
performance_df["commit_hash"] == releases[-1] |
|
] |
|
leaderboard_df = gr.components.Dataframe( |
|
value=initial_df[ |
|
PERFORMANCE_ALWAYS_HERE_COLS + performance_shown_columns.value |
|
], |
|
headers=[ |
|
PERFORMANCE_ALWAYS_HERE_COLS + performance_shown_columns.value |
|
], |
|
datatype=[ |
|
c.type |
|
for c in fields(PerformanceAutoEvalColumn) |
|
if c.name in PERFORMANCE_COLS |
|
], |
|
elem_id="leaderboard-table", |
|
elem_classes="large-table", |
|
interactive=False, |
|
) |
|
|
|
|
|
hidden_leaderboard_df = gr.components.Dataframe( |
|
value=performance_df, |
|
headers=PERFORMANCE_COLS, |
|
datatype=[ |
|
c.type |
|
for c in fields(PerformanceAutoEvalColumn) |
|
if c.name in PERFORMANCE_COLS |
|
], |
|
visible=False, |
|
) |
|
|
|
|
|
performance_filter_inputs = [ |
|
hidden_leaderboard_df, |
|
performance_shown_columns, |
|
filter_performance_models, |
|
exclude_performance_models, |
|
performance_shown_devices, |
|
performance_shown_os, |
|
short_speed_slider, |
|
long_speed_slider, |
|
short_toks_slider, |
|
long_toks_slider, |
|
release_dropdown, |
|
] |
|
|
|
filter_output = leaderboard_df |
|
filter_performance_models.change( |
|
performance_filter, performance_filter_inputs, filter_output |
|
) |
|
exclude_performance_models.change( |
|
performance_filter, performance_filter_inputs, filter_output |
|
) |
|
performance_shown_columns.change( |
|
performance_filter, performance_filter_inputs, filter_output |
|
) |
|
performance_shown_devices.change( |
|
performance_filter, performance_filter_inputs, filter_output |
|
) |
|
performance_shown_os.change( |
|
performance_filter, performance_filter_inputs, filter_output |
|
) |
|
short_speed_slider.change( |
|
performance_filter, performance_filter_inputs, filter_output |
|
) |
|
long_speed_slider.change( |
|
performance_filter, performance_filter_inputs, filter_output |
|
) |
|
short_toks_slider.change( |
|
performance_filter, performance_filter_inputs, filter_output |
|
) |
|
long_toks_slider.change( |
|
performance_filter, performance_filter_inputs, filter_output |
|
) |
|
release_dropdown.change( |
|
fn=update_performance_filters, |
|
inputs=[release_dropdown], |
|
outputs=[performance_shown_devices, performance_shown_os], |
|
queue=False, |
|
).then( |
|
fn=performance_filter, |
|
inputs=performance_filter_inputs, |
|
outputs=filter_output, |
|
) |
|
|
|
|
|
with gr.TabItem("Timeline", elem_id="timeline", id=4):
    # One hidden copy of the benchmark data feeds every timeline plot.
    # Each event binding used to create its own hidden Dataframe holding the
    # full benchmark table (twelve copies in total).
    timeline_source_df = gr.Dataframe(benchmark_df, visible=False)

    def _build_timeline_tab(tab_label, tab_id, metric_column, metric_label):
        """
        Create one timeline sub-tab: filter and exclude boxes plus a plot.

        The plot is drawn when the page loads and redrawn whenever either
        text box changes.

        :param tab_label: Text shown on the sub-tab
        :param tab_id: Gradio id of the sub-tab
        :param metric_column: Metric key passed to ``plot_metric``
        :param metric_label: Display label passed to ``plot_metric``; also the
            prefix of the plot title
        :return: Tuple of (filter textbox, exclude textbox, plot component)
        """
        plot_title = f"{metric_label} Over Time for Model-Device-OS Combinations"

        def _draw(data, include_query, exclude_query):
            return plot_metric(
                data,
                metric_column,
                metric_label,
                plot_title,
                include_query,
                exclude_query,
            )

        with gr.TabItem(tab_label, id=tab_id):
            with gr.Row():
                with gr.Column(scale=6):
                    filter_box = gr.Textbox(
                        placeholder="🔍 Filter Model-Device-OS (separate multiple queries with ';')",
                        label="Filter",
                    )
                with gr.Column(scale=4):
                    exclude_box = gr.Textbox(
                        placeholder="🔍 Exclude Model-Device-OS",
                        label="Exclude",
                    )
            with gr.Row():
                with gr.Column():
                    plot = gr.Plot(container=True)

            plot_inputs = [timeline_source_df, filter_box, exclude_box]
            demo.load(_draw, plot_inputs, plot)
            filter_box.change(_draw, plot_inputs, plot)
            exclude_box.change(_draw, plot_inputs, plot)

        return filter_box, exclude_box, plot

    with gr.Tabs():
        # The component names of the original layout are preserved.
        filter_qoi, exclude_qoi, qoi_plot = _build_timeline_tab(
            "QoI", 0, "qoi", "QoI"
        )
        filter_average_wer, exclude_average_wer, average_wer_plot = (
            _build_timeline_tab("Average WER", 1, "average_wer", "Average WER")
        )
        filter_speed, exclude_speed, speed_plot = _build_timeline_tab(
            "Speed", 2, "speed", "Speed"
        )
        filter_toks, exclude_toks, toks_plot = _build_timeline_tab(
            "Tok/s", 3, "tokens_per_second", "Tok/s"
        )
|
|
|
|
|
with gr.TabItem("Device Support", elem_id="device_support", id=6): |
|
|
|
gr.Markdown( |
|
""" |
|
## Device Support |
|
|
|
This tab shows **test results for SKUs that we actually attempted to test**. It tells you whether tests passed, failed, or couldn't be completed for the devices we tried to run tests on. |
|
|
|
### Please Note: |
|
**This tab only shows devices we attempted to test** - it doesn't show the full universe of available devices. |
|
|
|
**π For comprehensive coverage analysis**, see the **Test Coverage** tab which shows ALL available SKUs. |
|
""", |
|
elem_classes="markdown-text" |
|
) |
|
|
|
|
|
support_data = pd.read_csv( |
|
f"dashboard_data/support_data_{releases[-1][:7]}.csv" |
|
) |
|
support_data.set_index(support_data.columns[0], inplace=True) |
|
support_data["Model"] = support_data["Model"].apply( |
|
lambda x: x.replace("_", "/") |
|
) |
|
support_data["Model"] = support_data["Model"].apply( |
|
lambda x: make_model_name_clickable_link(x) |
|
) |
|
support_data = ( |
|
support_data.assign(model_len=support_data["Model"].str.len()) |
|
.sort_values( |
|
by=["model_len"], |
|
ascending=[True], |
|
) |
|
.drop(columns=["model_len"]) |
|
) |
|
|
|
with gr.Row(): |
|
with gr.Column(scale=1): |
|
with gr.Row(): |
|
with gr.Column(scale=6, elem_id="filter_models_column"): |
|
filter_support_models = gr.Textbox( |
|
placeholder="π Filter Model (separate multiple queries with ';')", |
|
label="Filter Models", |
|
) |
|
with gr.Column(scale=4, elem_classes="exclude_models_column"): |
|
exclude_support_models = gr.Textbox( |
|
placeholder="π Exclude Model", |
|
label="Exclude Model", |
|
) |
|
with gr.Row(): |
|
with gr.Accordion("See All Columns", open=False): |
|
with gr.Row(): |
|
with gr.Column(scale=9): |
|
support_shown_columns = gr.CheckboxGroup( |
|
choices=support_data.columns.tolist()[ |
|
1: |
|
], |
|
value=support_data.columns.tolist()[1:], |
|
label="Toggle Columns", |
|
elem_id="support-column-select", |
|
interactive=True, |
|
) |
|
with gr.Column(scale=1, min_width=200): |
|
with gr.Row(): |
|
select_all_support_button = gr.Button( |
|
"Select All", |
|
elem_id="select-all-support-button", |
|
interactive=True, |
|
) |
|
deselect_all_support_button = gr.Button( |
|
"Deselect All", |
|
elem_id="deselect-all-support-button", |
|
interactive=True, |
|
) |
|
|
|
with gr.Column(): |
|
gr.Markdown( |
|
""" |
|
### Legend |
|
- β
Supported: The model is supported and tested on this device. |
|
- β οΈ Failed: Either the model tests failed on this device or the Speed Factor for the test is less than 1. |
|
- ? Not Tested: The model is supported on this device but no test information available. |
|
- Not Supported: The model is not supported on this device as per the [WhisperKit configuration](https://huggingface.co/argmaxinc/whisperkit-coreml/blob/main/config.json). |
|
""" |
|
) |
|
|
|
|
|
device_support_table = gr.Dataframe( |
|
value=support_data, |
|
headers=support_data.columns.tolist(), |
|
datatype=["html" for _ in support_data.columns], |
|
elem_id="device-support-table", |
|
elem_classes="large-table", |
|
interactive=False, |
|
) |
|
|
|
|
|
hidden_support_df = gr.Dataframe(value=support_data, visible=False) |
|
|
|
def filter_support_data(df, columns, model_query, exclude_models):
    """
    Filters the device support data based on specified criteria.

    Queries are ';'-separated, case-insensitive and matched against the
    'Model' column. Empty terms (for example from a trailing ';') are ignored.
    Include terms are regular expressions; if they do not form a valid
    expression they are matched literally. Exclude terms are always literal.

    :param df: The DataFrame to be filtered (not modified)
    :param columns: Columns to include in the output, in addition to 'Model'
    :param model_query: Query string to filter models
    :param exclude_models: Models to exclude
    :return: Filtered DataFrame
    """

    def _terms(raw_query):
        # Split on ';' and drop blanks: an empty alternative would match
        # every row.
        pieces = (piece.strip() for piece in raw_query.split(";"))
        return [piece for piece in pieces if piece]

    filtered_df = df.copy()

    include_terms = _terms(model_query) if model_query else []
    if include_terms:
        include_pattern = "|".join(include_terms)
        try:
            re.compile(include_pattern)
        except re.error:
            # Half-typed input such as "(turbo" is not a valid expression;
            # match it literally instead of failing.
            include_pattern = "|".join(re.escape(term) for term in include_terms)
        filtered_df = filtered_df[
            filtered_df["Model"].str.contains(
                include_pattern, case=False, regex=True, na=False
            )
        ]

    exclude_terms = _terms(exclude_models) if exclude_models else []
    if exclude_terms:
        exclude_pattern = "|".join(re.escape(term) for term in exclude_terms)
        filtered_df = filtered_df[
            ~filtered_df["Model"].str.contains(
                exclude_pattern, case=False, regex=True, na=False
            )
        ]

    # 'Model' always comes first; skip it in the selection so it is never
    # emitted twice.
    selected_columns = ["Model"] + [
        col for col in columns if col in df.columns and col != "Model"
    ]
    return filtered_df[selected_columns]
|
|
|
def select_all_support_columns(release):
    """
    List every selectable column of a release's support table.

    The release's support CSV is read again, its first column becomes the
    index, and all remaining columns except "Model" are returned.

    :param release: Selected release hash
    :return: List of all available choices
    """
    csv_path = f"dashboard_data/support_data_{release[:7]}.csv"
    table = pd.read_csv(csv_path)
    table = table.set_index(table.columns[0])

    return [name for name in table.columns if name != "Model"]
|
|
|
def deselect_all_support_columns():
    """Return an empty list, i.e. no support column selected."""
    return []
|
|
|
|
|
select_all_support_button.click( |
|
select_all_support_columns, |
|
inputs=[release_dropdown], |
|
outputs=support_shown_columns, |
|
) |
|
deselect_all_support_button.click( |
|
deselect_all_support_columns, |
|
inputs=[], |
|
outputs=support_shown_columns, |
|
) |
|
|
|
|
|
release_dropdown.change( |
|
update_support_table, |
|
inputs=[release_dropdown], |
|
outputs=[ |
|
device_support_table, |
|
support_shown_columns, |
|
hidden_support_df, |
|
], |
|
).then( |
|
filter_support_data, |
|
inputs=[ |
|
hidden_support_df, |
|
support_shown_columns, |
|
filter_support_models, |
|
exclude_support_models, |
|
], |
|
outputs=device_support_table, |
|
) |
|
|
|
|
|
for input_elem in [ |
|
filter_support_models, |
|
exclude_support_models, |
|
support_shown_columns, |
|
]: |
|
input_elem.change( |
|
filter_support_data, |
|
inputs=[ |
|
hidden_support_df, |
|
support_shown_columns, |
|
filter_support_models, |
|
exclude_support_models, |
|
], |
|
outputs=device_support_table, |
|
) |
|
|
|
|
|
with gr.TabItem("Test Coverage", elem_id="test_coverage", id=7): |
|
|
|
gr.Markdown( |
|
""" |
|
## Test Coverage |
|
|
|
This tab shows **ALL available SKUs** and our testing coverage across the entire device ecosystem. Uses chip-based expansion where testing one device covers all devices with the same chip. |
|
""", |
|
elem_classes="markdown-text" |
|
) |
|
|
|
def load_coverage_data(release):
    """
    Load test coverage data for a specific release.

    Reads ``dashboard_data/test_coverage_<release>.json``. When that file does
    not exist, an all-zero/empty placeholder for the release is returned so
    callers can still render a summary.

    :param release: Release hash used in the file name
    :return: Dictionary with the coverage data
    """
    coverage_path = f"dashboard_data/test_coverage_{release}.json"
    try:
        # JSON is UTF-8 by specification; do not rely on the platform default.
        with open(coverage_path, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {
            "commit_hash": release,
            "total_devices": 0,
            "tested_devices": 0,
            "skipped_devices": 0,
            "coverage_percentage": 0.0,
            "tested_device_list": [],
            "skipped_device_list": [],
            "tested_os_versions": [],
            "has_target_os_coverage": False,
            "covered_target_versions": [],
            "missing_target_versions": [],
        }
|
|
|
def format_coverage_devices(device_list):
    """
    Build a one-column "Device" DataFrame from a list of device names.

    A missing or empty list gives an empty frame that still has the "Device"
    column; otherwise rows are ordered by device name.
    """
    if device_list:
        devices_frame = pd.DataFrame({"Device": device_list})
        return devices_frame.sort_values(["Device"])

    return pd.DataFrame(columns=["Device"])
|
|
|
def update_coverage_data(release):
    """
    Update coverage data when release changes.

    Loads the coverage record for the release and produces the markdown
    summary plus the tested/skipped device tables.

    :param release: Selected release hash
    :return: Tuple of (summary update, tested-table update, skipped-table
        update, tested DataFrame, skipped DataFrame)
    """
    coverage_data = load_coverage_data(release)

    # Device lists are read leniently, like the target-version lists below,
    # so a record without them renders as empty tables.
    tested_df = format_coverage_devices(coverage_data.get("tested_device_list", []))
    skipped_df = format_coverage_devices(
        coverage_data.get("skipped_device_list", [])
    )

    # Optional section: which target OS versions are covered or missing.
    target_os_status = ""
    covered_versions = coverage_data.get("covered_target_versions", [])
    missing_versions = coverage_data.get("missing_target_versions", [])

    if covered_versions or missing_versions:
        target_os_status = "\n- **Target OS Coverage**:\n"
        if covered_versions:
            # Each covered version is listed once, in sorted order.
            unique_versions = sorted(set(covered_versions))
            target_os_status += f" - ✅ **Tested**: {', '.join(unique_versions)}\n"
        if missing_versions:
            target_os_status += f" - ❌ **Missing**: {', '.join(missing_versions)}"

    coverage_summary = f"""## Test Coverage Summary for Release {release} (v{SHA_TO_VERSION.get(release, 'Unknown')})

- **Total Devices**: {coverage_data['total_devices']}
- **Tested Devices**: {coverage_data['tested_devices']}
- **Skipped Devices**: {coverage_data['skipped_devices']}
- **Coverage Percentage**: {coverage_data['coverage_percentage']:.1f}%
{target_os_status}"""

    return (
        gr.update(value=coverage_summary),
        gr.update(value=tested_df),
        gr.update(value=skipped_df),
        tested_df,
        skipped_df,
    )
|
|
|
def filter_coverage_devices(df, device_query, exclude_devices):
    """Filter a coverage device table by include and exclude queries.

    Both queries are ``;``-separated lists of terms matched
    case-insensitively against the ``Device`` column.

    Args:
        df: DataFrame with a ``Device`` column, or ``None``.
        device_query: Terms a device must match to be kept. Terms are treated
            as regular expressions; if the combined pattern is not a valid
            regex, the terms are matched literally instead of raising.
        exclude_devices: Terms that remove a device when matched. These are
            always matched literally.

    Returns:
        A filtered copy of ``df``. ``df`` itself is returned unchanged when it
        is ``None`` or empty.
    """
    if df is None or df.empty:
        return df

    def split_terms(raw):
        # Drop blank terms (trailing ';' or whitespace-only input): an empty
        # regex alternative matches every row, which would make an exclude
        # query such as "iPad;" hide the whole table.
        return [term.strip() for term in (raw or "").split(";") if term.strip()]

    filtered_df = df.copy()

    include_terms = split_terms(device_query)
    if include_terms:
        pattern = "|".join(include_terms)
        try:
            re.compile(pattern)
        except re.error:
            # Half-typed input such as "(M2" is not a valid regex; fall back
            # to a literal match rather than failing the callback.
            pattern = "|".join(re.escape(term) for term in include_terms)
        keep = filtered_df["Device"].str.contains(
            pattern, case=False, regex=True, na=False
        )
        filtered_df = filtered_df[keep]

    exclude_terms = split_terms(exclude_devices)
    if exclude_terms:
        pattern = "|".join(re.escape(term) for term in exclude_terms)
        drop = filtered_df["Device"].str.contains(
            pattern, case=False, regex=True, na=False
        )
        filtered_df = filtered_df[~drop]

    return filtered_df
|
|
|
|
|
# Seed the coverage tab with the data for the last entry in `releases`, so the
# summary and tables are populated before the release dropdown is touched.
initial_coverage = load_coverage_data(releases[-1])
initial_tested_df = format_coverage_devices(initial_coverage["tested_device_list"])
initial_skipped_df = format_coverage_devices(initial_coverage["skipped_device_list"])

covered_versions = initial_coverage.get("covered_target_versions", [])
missing_versions = initial_coverage.get("missing_target_versions", [])

# The target-OS section is only rendered when there is something to report.
initial_target_os_status = ""
if covered_versions or missing_versions:
    initial_target_os_status = "\n- **Target OS Coverage**:\n"
    if covered_versions:
        unique_versions = sorted(set(covered_versions))
        initial_target_os_status += (
            f" - ✅ **Tested**: {', '.join(unique_versions)}\n"
        )
    if missing_versions:
        initial_target_os_status += (
            f" - ❌ **Missing**: {', '.join(missing_versions)}"
        )

# Joined line by line so the Markdown never depends on source indentation.
initial_summary_content = "\n".join(
    [
        f"## Test Coverage Summary for Release {releases[-1]} "
        f"(v{SHA_TO_VERSION.get(releases[-1], 'Unknown')})",
        "",
        f"- **Total Devices**: {initial_coverage['total_devices']}",
        f"- **Tested Devices**: {initial_coverage['tested_devices']}",
        f"- **Skipped Devices**: {initial_coverage['skipped_devices']}",
        f"- **Coverage Percentage**: {initial_coverage['coverage_percentage']:.1f}%",
        initial_target_os_status,
    ]
)
|
|
|
|
|
# Markdown summary for the selected release; replaced by update_coverage_data.
coverage_summary_text = gr.Markdown(
    value=initial_summary_content,
    elem_classes="markdown-text",
)

# Include / exclude text filters applied to both device tables.
with gr.Row():
    with gr.Column(scale=1):
        with gr.Row():
            with gr.Column(scale=6):
                filter_coverage_devices_input = gr.Textbox(
                    placeholder="🔍 Filter Device (separate multiple queries with ';')",
                    label="Filter Devices",
                )
            with gr.Column(scale=4):
                exclude_coverage_devices_input = gr.Textbox(
                    placeholder="🔍 Exclude Device",
                    label="Exclude Device",
                )
|
|
|
|
|
def _coverage_device_table(devices, elem_id):
    """Create a read-only, single-column device table for a coverage tab."""
    return gr.Dataframe(
        value=devices,
        headers=["Device"],
        datatype=["str"],
        elem_id=elem_id,
        elem_classes="large-table",
        interactive=False,
    )


# One tab per device group, each holding the table the filters write to.
with gr.Tabs():
    with gr.TabItem("Tested Devices", id=0):
        tested_devices_table = _coverage_device_table(
            initial_tested_df, "tested-devices-table"
        )

    with gr.TabItem("Skipped Devices", id=1):
        skipped_devices_table = _coverage_device_table(
            initial_skipped_df, "skipped-devices-table"
        )

# Invisible copies of the unfiltered tables; the filter callbacks read these
# so each new query starts from the full device list.
hidden_tested_df = gr.Dataframe(value=initial_tested_df, visible=False)
hidden_skipped_df = gr.Dataframe(value=initial_skipped_df, visible=False)
|
|
|
|
|
# Components refreshed on a release change, listed in the same order as the
# tuple returned by update_coverage_data.
coverage_release_outputs = [
    coverage_summary_text,
    tested_devices_table,
    skipped_devices_table,
    hidden_tested_df,
    hidden_skipped_df,
]

release_dropdown.change(
    update_coverage_data,
    inputs=[release_dropdown],
    outputs=coverage_release_outputs,
    queue=False,
)
|
|
|
|
|
# Both filter boxes trigger the same refresh: re-filter the unfiltered hidden
# tables with the current include/exclude text and show the results.
coverage_filter_inputs = [
    hidden_tested_df,
    hidden_skipped_df,
    filter_coverage_devices_input,
    exclude_coverage_devices_input,
]
coverage_filter_outputs = [tested_devices_table, skipped_devices_table]

for input_elem in (
    filter_coverage_devices_input,
    exclude_coverage_devices_input,
):
    input_elem.change(
        lambda tested_df, skipped_df, device_query, exclude_devices: (
            filter_coverage_devices(tested_df, device_query, exclude_devices),
            filter_coverage_devices(skipped_df, device_query, exclude_devices),
        ),
        inputs=coverage_filter_inputs,
        outputs=coverage_filter_outputs,
    )
|
|
|
|
|
# Static tab that renders the methodology write-up from constants.
with gr.TabItem("Methodology", elem_id="methodology", id=8):
    gr.Markdown(value=METHODOLOGY_TEXT, elem_id="methodology-text")
|
|
|
|
|
# Collapsed-by-default section with a copyable citation snippet.
with gr.Accordion("📙 Citation", open=False):
    citation_button = gr.Textbox(
        value=CITATION_BUTTON_TEXT,
        label=CITATION_BUTTON_LABEL,
        lines=7,
        elem_id="citation-button",
        show_copy_button=True,
    )
|
|
|
|
|
# Start the Gradio app.
demo.launch(
    debug=True,
    share=True,
    ssr_mode=False,
)
|
|