Space: mj-new
Commit: 37d493c
Parent(s): ceb2b55
Commit message: Updated leaderboard code and requirements

Changed files:
- app.py (+167 / -52)
- constants.py (+2 / -1)
- requirements.txt (+1 / -1)
- utils.py (+87 / -15)
app.py (CHANGED)

@@ -2,10 +2,12 @@ import os
 import streamlit as st
 import pandas as pd
 from constants import BIGOS_INFO, PELCRA_INFO, ANALYSIS_INFO, ABOUT_INFO, INSPECTION_INFO, COMPARISON_INFO
-from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension, box_plot_per_dimension_with_colors, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
+from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension, box_plot_per_dimension_subsets, box_plot_per_dimension_with_colors, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
 from app_utils import calculate_height_to_display, filter_dataframe
 import matplotlib.pyplot as plt
 import numpy as np
+import statsmodels.api as sm
+import seaborn as sns
 
 hf_token = os.getenv('HF_TOKEN')
 if hf_token is None:
@@ -185,7 +187,7 @@ def create_radar_plot(df, enable_labels, systems, metric, norm_type, ref_type='o
     st.pyplot(fig)
 
 with about:
-    st.title("
+    st.title("AMU Polish ASR Leaderboard")
     st.markdown(ABOUT_INFO, unsafe_allow_html=True)
 
     # Table - evaluated systems # TODO - change to concatenated table
@@ -196,6 +198,13 @@ with about:
     #print("ASR systems available in the eval results for dataset {}: ".format(dataset), evaluated_systems_list )
 
     df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
+    # drop columns "Included in BIGOS benchmark"
+    df_evaluated_systems = df_evaluated_systems.drop(columns=["Included in BIGOS benchmark"])
+    # drop empty rows
+    df_evaluated_systems = df_evaluated_systems.dropna(how='all')
+    # drop empty columns
+    df_evaluated_systems = df_evaluated_systems.dropna(axis=1, how='all')
+
     codename_to_shortname_mapping = dict(zip(df_evaluated_systems["Codename"],df_evaluated_systems["Shortname"]))
     #print(codename_to_shortname_mapping)
 
@@ -203,14 +212,32 @@ with about:
 
     df_evaluated_systems_types_and_count = df_evaluated_systems["Type"].value_counts().reset_index()
     df_evaluated_systems_types_and_count.columns = ["Type", "Count"]
-    st.
+    st.subheader("Evaluated systems:")
 
     st.dataframe(df_evaluated_systems_types_and_count, hide_index=True, use_container_width=False)
 
-    st.header("Detalied info about evaluated ASR systems")
-
     #TODO - add info who created the system (company, institution, team, etc.)
-
+    # Split into separate tables for free and commercial systems
+    free_systems = df_evaluated_systems[df_evaluated_systems['Type'] == 'free']
+    commercial_systems = df_evaluated_systems[df_evaluated_systems['Type'] == 'commercial']
+
+    st.subheader("Free systems:")
+    # drop empty columns
+    free_systems = free_systems.dropna(axis=1, how='all')
+    # drop empty rows
+    free_systems = free_systems.dropna(how='all')
+
+    # do not display index
+    st.dataframe(free_systems, hide_index=True, height = h_df_systems, use_container_width=True)
+
+    st.subheader("Commercial systems:")
+    # drop empty columns
+    commercial_systems = commercial_systems.dropna(axis=1, how='all')
+    # do not display index
+    # drop empty rows
+    commercial_systems = commercial_systems.dropna(how='all')
+
+    st.dataframe(commercial_systems, hide_index=True, height = h_df_systems, use_container_width=True)
 
     # Table - evaluation datasets
     # Table - evaluation metrics
@@ -223,6 +250,8 @@ with about:
     # List - TODOs
 
 with lead_bigos:
+    st.title("BIGOS Leaderboard")
+    st.markdown(BIGOS_INFO, unsafe_allow_html=True)
 
     # configuration for tab
     dataset = "amu-cai/pl-asr-bigos-v2-secret"
@@ -257,17 +286,17 @@ with lead_bigos:
     # save sample to tsv
     df_per_dataset_with_asr_systems_meta.sample(5).to_csv("sample.tsv", sep="\t", index=False)
 
+    ########### EVALUATION PARAMETERS PRESENTATION ################
+    st.title("ASR leaderboard for dataset: {} {}".format(dataset_short_name, dataset_version))
+
     # MOST IMPORTANT RESULTS
     analysis_dim = "system"
     metric = "WER"
-    st.subheader("
-    fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
+    st.subheader("Leaderboard - Median {} per ASR {} across all subsets of {} dataset".format(metric, analysis_dim, dataset_short_name))
+    fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim, metric + "[%]","System", "Type")
     st.pyplot(fig, clear_figure=True, use_container_width=True)
 
-
-    ########### EVALUATION PARAMETERS PRESENTATION ################
-    st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
-    st.markdown(BIGOS_INFO, unsafe_allow_html=True)
+    st.header("Benchmark details")
     st.markdown("**Evaluation date:** {}".format(eval_date))
     st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
     st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
@@ -301,7 +330,6 @@ with lead_bigos:
     h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
     st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
 
-
    ##################### PER SUBSET ANALYSIS #########################
     analysis_dim = "subset"
     metric = "WER"
@@ -311,7 +339,7 @@ with lead_bigos:
     st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
 
     st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
-    fig =
+    fig = box_plot_per_dimension_subsets(df_per_dataset, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim +' of dataset ' + dataset_short_name , metric + " (%)", "system")
     st.pyplot(fig, clear_figure=True, use_container_width=True)
 
     ### IMPACT OF NORMALIZATION ON ERROR RATES #####
@@ -395,16 +423,14 @@ with lead_pelcra:
 
     df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
 
-
+    # MOST IMPORTANT RESULTS
     analysis_dim = "system"
     metric = "WER"
-    st.subheader("
+    st.subheader("Leaderboard - Median {} per ASR {} across all subsets of {} dataset".format(metric, analysis_dim, dataset_short_name))
     fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
     st.pyplot(fig, clear_figure=True, use_container_width=True)
-
-
-    st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
-    st.markdown(BIGOS_INFO, unsafe_allow_html=True)
+
+    st.header("Benchmark details")
     st.markdown("**Evaluation date:** {}".format(eval_date))
     st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
     st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
@@ -447,7 +473,7 @@ with lead_pelcra:
     st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
 
     st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
-    fig =
+    fig = box_plot_per_dimension_subsets(df_per_dataset, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim +' of dataset ' + dataset_short_name , metric + " (%)", "system")
     st.pyplot(fig, clear_figure=True, use_container_width=True)
 
     ### IMPACT OF NORMALIZATION ON ERROR RATES #####
@@ -502,6 +528,13 @@ with analysis:
 
     dataset = st.selectbox("Select Dataset", datasets, index=datasets.index('amu-cai/pl-asr-bigos-v2-secret'), key="select_dataset_scenarios")
 
+    if dataset == "amu-cai/pl-asr-bigos-v2-secret":
+        dataset_short_name = "BIGOS"
+    elif dataset == "pelcra/pl-asr-pelcra-for-bigos-secret":
+        dataset_short_name = "PELCRA"
+    else:
+        dataset_short_name = "UNKNOWN"
+
     # read the latest results for the selected dataset
     print("Reading the latest results for dataset: ", dataset)
     df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
@@ -547,7 +580,7 @@ with analysis:
     st.subheader("Best and worst systems for dataset {}".format(dataset))
     df_best_worse_systems = pd.DataFrame(data, columns=header)
     # do not display index
-    st.dataframe(df_best_worse_systems)
+    st.dataframe(df_best_worse_systems, hide_index=True)
 
     st.subheader("Comparison of average WER for best systems")
     df_per_dataset_best_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_best_wer, commercial_system_with_best_wer])]
@@ -602,21 +635,74 @@ with analysis:
     # Y is thw average WER
     # make each point a different color
     # provide legend with system names
-    fig, ax = plt.subplots()
+    fig, ax = plt.subplots(figsize=(10, 7))
+
+    # Define larger jitter for close points
+    jitter_x = 5
+    jitter_y = 0.2
+
+    # Alternate marker shapes to distinguish overlapping points
+    marker_styles = ['o', 's', 'D', '^', 'v', '<', '>'] # Circle, square, diamond, and other shapes
+    marker_dict = {system: marker_styles[i % len(marker_styles)] for i, system in enumerate(free_systems_wer['system'].unique())}
+
     for system in free_systems_wer['system'].unique():
         subset = free_systems_wer[free_systems_wer['system'] == system]
-
-
+        marker_style = marker_dict[system]
+
+        # Scatter plot with distinct marker shapes for each system
+        ax.scatter(
+            subset['Parameters [M]'] + jitter_x * (np.random.rand(len(subset)) - 0.5), # Apply jitter to x for overlap
+            subset['WER'] + jitter_y * (np.random.rand(len(subset)) - 0.5), # Apply jitter to y for overlap
+            label=system, s=100, alpha=0.7, edgecolor='black', marker=marker_style
+        )
+
+        # Add text annotations with dynamic positioning to avoid overlap with y-axis
         for i, point in subset.iterrows():
-
-
-
-
-
-
-
+            # Adjust position to avoid overlap with y-axis
+            x_offset = 10 if point['Parameters [M]'] < 50 else -10 if i % 2 == 1 else 10 # Push right if close to y-axis
+            y_offset = -0.5 if i % 2 == 0 else 0.5 # Alternate vertical offset
+
+            ax.annotate(
+                point['system'],
+                (point['Parameters [M]'], point['WER']),
+                textcoords="offset points",
+                xytext=(x_offset, y_offset),
+                ha='right' if x_offset < 0 else 'left',
+                fontsize=10,
+                bbox=dict(boxstyle="round,pad=0.3", edgecolor='white', facecolor='white', alpha=0.7)
+            )
+
+    # Set axis labels and title
+    ax.set_xlabel('Model Size [M Parameters]', fontsize=12)
+    ax.set_ylabel('WER (%)', fontsize=12)
+    ax.set_title(f'WER vs. Model Size for Dataset {dataset_short_name}', fontsize=14, pad=20)
+
+    # Adjust legend settings to fit outside the main plot area
+    ax.legend(
+        title='System', bbox_to_anchor=(0.8, 1), loc='upper left',
+        fontsize=8, title_fontsize=9, frameon=True, shadow=False, facecolor='white')
+    #)
+
+    # Add grid lines and minor ticks for better readability
+    ax.grid(True, linestyle='--', alpha=0.5)
+    ax.minorticks_on()
+    ax.tick_params(which='both', direction='in', top=True, right=True)
+
+
+    # increase granularity of y-axis to 20 points per whole range
+    # Set y-axis limits: lower bound at 0, upper bound to next highest multiple of 5
+    y_min = 0
+    y_max = ax.get_ylim()[1] # Get the current maximum y value
+    y_max_rounded = np.ceil(y_max / 5) * 5 # Round y_max up to the next highest multiple of 5
+    ax.set_ylim(y_min, y_max_rounded)
+
+    # Improve layout spacing
+    plt.tight_layout()
+
+    # Display the plot
     st.pyplot(fig)
 
+
     ##################################################################################################################################################
     # WER per audio duration
 
@@ -653,11 +739,7 @@ with analysis:
     # print dataframe in streamlit
     st.dataframe(df_per_sample_wer_audio_pivot)
 
-    #
-    # each system should have a different color
-    # the size of the point should be proportional to the number of samples in the bucket
-    # the x axis should be the audio duration bucket
-    # the y axis should be the average WER
+    # create scatter plot with WER in function of audio duration
     fig, ax = plt.subplots()
     for system in selected_systems:
         subset = df_per_sample_wer_audio[df_per_sample_wer_audio['system'] == system]
@@ -678,7 +760,7 @@ with analysis:
     audio_feature_to_analyze = 'speech_rate_words'
     audio_feature_unit = ' [words/s]'
     metric = 'WER'
-    metric_unit = '
+    metric_unit = ' (%)'
     no_of_buckets = 10
     # calculate average WER per audio duration bucket for the best and worse commercial and free systems
     selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]
@@ -688,24 +770,57 @@ with analysis:
     # print dataframe in streamlit
     st.dataframe(df_per_sample_wer_feature_pivot)
 
-    #
-
-    # the size of the point should be proportional to the number of samples in the bucket
-    # the x axis should be the audio duration bucket
-    # the y axis should be the average WER
-    fig, ax = plt.subplots()
-    for system in selected_systems:
-        subset = df_per_sample_wer_feature[df_per_sample_wer_feature['system'] == system]
-        ax.scatter(subset[audio_feature_to_analyze], subset[metric], label=system, s=subset['number_of_samples']*0.5)
-    ax.set_xlabel(audio_feature_to_analyze.replace('_',' ').capitalize() + audio_feature_unit)
-    ax.set_ylabel(metric + metric_unit)
-    ax.set_title('WER in function of speech rate.'.format(audio_feature_to_analyze))
+    # Set a threshold to remove outliers - here we use the 97th percentile of WER
+    threshold = df_per_sample_wer_feature[metric].quantile(0.97)
 
-    #
-
+    # Remove data points with WER greater than the threshold
+    filtered_df = df_per_sample_wer_feature[df_per_sample_wer_feature[metric] <= threshold]
+
+    # Create figure and axis with larger size
+    fig, ax = plt.subplots(figsize=(10, 7))
+
+    # Scatter plot for each system
+    for system in selected_systems:
+        subset = filtered_df[filtered_df['system'] == system]
+        ax.scatter(subset[audio_feature_to_analyze],
+                   subset[metric],
+                   label=system,
+                   s=subset['number_of_samples'] * 0.5,
+                   alpha=0.6) # Set alpha for better visibility of overlapping points
+
+        # Adding a trend line using LOWESS
+        lowess = sm.nonparametric.lowess
+        trend = lowess(subset[metric], subset[audio_feature_to_analyze], frac=0.3) # Adjust frac to control smoothing
+        ax.plot(trend[:, 0], trend[:, 1], label=f'{system} Trend', linestyle='-', linewidth=2)
+
+    # Set axis labels with improved formatting for readability
+    ax.set_xlabel(audio_feature_to_analyze.replace('_', ' ').capitalize() + ' ' + audio_feature_unit )
+    ax.set_ylabel(metric + ' ' + metric_unit )
+
+    # Set an improved title that is more informative
+    ax.set_title('Word Error Rate (WER) vs Speech Rate\nBest Performing Free and Paid Systems', fontsize=14)
+
+    # increase granularity of y-axis to 20 points per whole range
+    # Set y-axis limits: lower bound at 0, upper bound to next highest multiple of 5
+    y_min = 0
+    y_max = ax.get_ylim()[1] # Get the current maximum y value
+    y_max_rounded = np.ceil(y_max / 5) * 5 # Round y_max up to the next highest multiple of 5
+    ax.set_ylim(y_min, y_max_rounded)
+
+    # Add a grid to improve readability and alignment
+    ax.grid(True, linestyle='--', alpha=0.7)
+
+    # Place legend outside the plot area to prevent overlapping with data points
+    ax.legend(title='System', loc='upper right', bbox_to_anchor=(0.95, 1))
+
+    # Add tight layout to improve spacing between elements
+    fig.tight_layout()
+
+    # Display the plot
     st.pyplot(fig)
 
 
+
     ################################################################################################################################################
     # WER PER GENDER
 
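The new speech-rate plot above relies on statsmodels' LOWESS smoother (which is why requirements.txt gains statsmodels below). For readers unfamiliar with that call, here is a minimal, self-contained sketch of the same pattern on synthetic data; it is not part of the commit, and the column names simply mirror the ones used in app.py.

# Standalone sketch of the LOWESS trend-line pattern used in the new app.py plot.
# Synthetic data only; not part of the commit.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm

rng = np.random.default_rng(0)
df = pd.DataFrame({
    "speech_rate_words": rng.uniform(1.0, 5.0, 200),  # words per second
    "WER": rng.uniform(5, 40, 200),                   # per-sample WER in %
})

fig, ax = plt.subplots()
ax.scatter(df["speech_rate_words"], df["WER"], s=10, alpha=0.5, label="samples")

# sm.nonparametric.lowess returns (x, smoothed y) pairs sorted by x;
# frac controls how much of the data each local fit uses (smoothing strength).
trend = sm.nonparametric.lowess(df["WER"], df["speech_rate_words"], frac=0.3)
ax.plot(trend[:, 0], trend[:, 1], linewidth=2, label="LOWESS trend")

ax.set_xlabel("Speech rate [words/s]")
ax.set_ylabel("WER (%)")
ax.legend()
plt.show()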
constants.py (CHANGED)

@@ -1,7 +1,8 @@
 ABOUT_INFO = "Polish ASR leaderboard by [AMU-CAI team](https://huggingface.co/amu-cai) aims to provide comprehensive overview of performance of ASR/STT systems for Polish. <br>\
 The leaderboard currently supports [BIGOS V2](https://huggingface.co/datasets/amu-cai/pl-asr-bigos-v2) and [PELCRA for BIGOS](https://huggingface.co/datasets/pelcra/pl-asr-pelcra-for-bigos) datasets.<br>\
+If you want to add your system or dataset to the leaderboard, please contact Michał Junczyk (michal.junczyk@amu.edu.pl) or open a pull request on [GitHub](https://github.com/goodmike31/pl-asr-bigos-tools) <br>\
 To learn more please read blog post [here](https://huggingface.co/blog/michaljunczyk/introducing-polish-asr-leaderboard).<br> \
-If you use this work, please
+If you use this work, please cite it as follows: <br> \
 ```@misc{amu_cai_pl_asr_leaderboard, \
 author = {Michał Junczyk}, \
 title = {{AMU Polish ASR Leaderboard}}, \
requirements.txt (CHANGED)

@@ -1,2 +1,2 @@
 seaborn
-
+statsmodels
utils.py (CHANGED)

@@ -9,22 +9,10 @@ from datasets import Dataset
 from huggingface_hub import hf_hub_download
 import matplotlib.patches as mpatches
 import matplotlib as mpl
+from constants import asr_systems_colors_mapping
+from matplotlib.lines import Line2D
 
 
-asr_systems_colors_mapping = {
-    'azure': '#1f77b4',         # Blue
-    'google': '#2ca02c',        # Green
-    'wav2vec2': '#d62728',      # Red
-    'nemo': '#9467bd',          # Purple
-    'assemblyai': '#8c564b',    # Brown
-    'mms': '#e377c2',           # Pink
-    'google_v2': '#7f7f7f',     # Gray
-    'whisper_cloud': '#bcbd22', # Olive
-    'whisper_local': '#ff7f0e', # Orange
-
-    # Add or override other systems and their colors
-}
-
 def download_tsv_from_google_sheet(sheet_url):
     # Modify the Google Sheet URL to export it as TSV
     tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')
@@ -181,7 +169,7 @@ def filter_bottom_outliers(df_input, metric, min_threshold):
 
 def box_plot_per_dimension(df_input, metric, dimension, title, xlabel, ylabel):
     # Box plot for WER per dataset
-    fig, ax = plt.subplots(figsize=(
+    fig, ax = plt.subplots(figsize=(12, 8))
 
     # generate box plot without outliers
     sns.boxplot(x=dimension, y=metric, data=df_input, order=df_input.groupby(dimension)[metric].median().sort_values().index, showfliers=False)
@@ -193,6 +181,90 @@ def box_plot_per_dimension(df_input, metric, dimension, title, xlabel, ylabel):
     #return figure
     return plt
 
+def box_plot_per_dimension_subsets(df_input, metric, dimension, title, xlabel, ylabel, category_column, y_limit=100):
+    """
+    Plots a box plot with individual data points colored and marked by a specified category.
+
+    Parameters:
+    - df_input (pd.DataFrame): Input DataFrame containing data to plot.
+    - metric (str): Column name for the metric to plot on the y-axis.
+    - dimension (str): Column name for the dimension (x-axis categories).
+    - title (str): Title of the plot.
+    - xlabel (str): Label for the x-axis.
+    - ylabel (str): Label for the y-axis.
+    - category_column (str): Column name to use for differentiating data points by color and marker.
+    - y_limit (float, optional): Maximum value for the y-axis to limit extreme outliers.
+
+    Returns:
+    - fig: The matplotlib figure object.
+    """
+
+    # Set up the figure and axis with a larger size for readability
+    fig, ax = plt.subplots(figsize=(14, 8))
+
+    # Create a sorted order for the dimension based on the median values of the metric
+    order = df_input.groupby(dimension)[metric].median().sort_values().index
+
+    # Generate box plot without showing extreme outliers
+    boxplot = sns.boxplot(
+        x=dimension, y=metric, data=df_input,
+        order=order, showfliers=False, width=0.6, ax=ax,
+        color="white"
+    )
+
+    # Make the box plots transparent by adjusting the facecolor of each box
+    for patch in boxplot.artists:
+        patch.set_facecolor("white")
+        patch.set_alpha(0.2)  # Set transparency
+
+    # Define category-specific colors and marker styles
+    categories = df_input[category_column].unique()
+    markers = ['o', 's', '^', 'D', 'X', 'P', '*']  # Different marker styles
+    colors = sns.color_palette("Set2", len(categories))  # Use a color palette with distinct colors
+    category_style_map = {category: {'color': colors[i % len(colors)], 'marker': markers[i % len(markers)]}
+                          for i, category in enumerate(categories)}
+
+    # Overlay individual data points with category-specific colors and markers
+    for category, style in category_style_map.items():
+        # Filter data for each category
+        category_data = df_input[(df_input[category_column] == category) & (df_input[metric] <= y_limit)]
+        sns.stripplot(
+            x=dimension, y=metric, data=category_data,
+            order=order, color=style['color'], marker=style['marker'],
+            size=5, jitter=True, alpha=1, ax=ax
+        )
+
+    # Set title and axis labels
+    ax.set_title(title)
+    ax.set_xlabel(xlabel)
+    ax.set_ylabel(ylabel)
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
+
+    # Add gridlines for easier comparison
+    plt.grid(axis='y', linestyle='--', alpha=0.5)
+
+    # Set y-axis limit to improve readability
+    # Calculate the y-axis maximum as the next multiple of 5 above the data’s max value
+    # Make sure the max value does not contain any extreme outliers. Threhold at 98th percentile
+    max_value = df_input[metric].quantile(0.99)
+
+    y_max = (int(max_value / 5) + 1) * 5
+
+    # Set y-axis ticks with evenly spaced intervals of 5
+    ax.set_yticks(range(0, y_max + 1, 5))
+    ax.set_ylim(0, y_max)
+
+    # Create a custom legend with unique entries for each category
+    legend_handles = [
+        Line2D([0], [0], marker=style['marker'], color='w', markerfacecolor=style['color'], markersize=8, label=category)
+        for category, style in category_style_map.items()
+    ]
+    ax.legend(handles=legend_handles, title=category_column, bbox_to_anchor=(1.05, 1), loc='upper left')
+
+    # Return the updated figure
+    return fig
+
+
 def box_plot_per_dimension_with_colors(df_input, metric, dimension, title, xlabel, ylabel, system_col, type_col):
     # Create a figure and axis object
     fig, ax = plt.subplots(figsize=(12, 8))