mj-new committed on
Commit
37d493c
1 Parent(s): ceb2b55

Updated leaderboard code and requirements

Browse files
Files changed (4) hide show
  1. app.py +167 -52
  2. constants.py +2 -1
  3. requirements.txt +1 -1
  4. utils.py +87 -15
app.py CHANGED
@@ -2,10 +2,12 @@ import os
2
  import streamlit as st
3
  import pandas as pd
4
  from constants import BIGOS_INFO, PELCRA_INFO, ANALYSIS_INFO, ABOUT_INFO, INSPECTION_INFO, COMPARISON_INFO
5
- from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension, box_plot_per_dimension_with_colors, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
6
  from app_utils import calculate_height_to_display, filter_dataframe
7
  import matplotlib.pyplot as plt
8
  import numpy as np
 
 
9
 
10
  hf_token = os.getenv('HF_TOKEN')
11
  if hf_token is None:
@@ -185,7 +187,7 @@ def create_radar_plot(df, enable_labels, systems, metric, norm_type, ref_type='o
185
  st.pyplot(fig)
186
 
187
  with about:
188
- st.title("About the Polish ASR Leaderboard")
189
  st.markdown(ABOUT_INFO, unsafe_allow_html=True)
190
 
191
  # Table - evaluated systems # TODO - change to concatenated table
@@ -196,6 +198,13 @@ with about:
196
  #print("ASR systems available in the eval results for dataset {}: ".format(dataset), evaluated_systems_list )
197
 
198
  df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
 
 
 
 
 
 
 
199
  codename_to_shortname_mapping = dict(zip(df_evaluated_systems["Codename"],df_evaluated_systems["Shortname"]))
200
  #print(codename_to_shortname_mapping)
201
 
@@ -203,14 +212,32 @@ with about:
203
 
204
  df_evaluated_systems_types_and_count = df_evaluated_systems["Type"].value_counts().reset_index()
205
  df_evaluated_systems_types_and_count.columns = ["Type", "Count"]
206
- st.header("Number of systems evaluated (freely and commercially available)")
207
 
208
  st.dataframe(df_evaluated_systems_types_and_count, hide_index=True, use_container_width=False)
209
 
210
- st.header("Detalied info about evaluated ASR systems")
211
-
212
  #TODO - add info who created the system (company, institution, team, etc.)
213
- st.dataframe(df_evaluated_systems, hide_index=True, height = h_df_systems, use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
 
215
  # Table - evaluation datasets
216
  # Table - evaluation metrics
@@ -223,6 +250,8 @@ with about:
223
  # List - TODOs
224
 
225
  with lead_bigos:
 
 
226
 
227
  # configuration for tab
228
  dataset = "amu-cai/pl-asr-bigos-v2-secret"
@@ -257,17 +286,17 @@ with lead_bigos:
257
  # save sample to tsv
258
  df_per_dataset_with_asr_systems_meta.sample(5).to_csv("sample.tsv", sep="\t", index=False)
259
 
 
 
 
260
  # MOST IMPORTANT RESULTS
261
  analysis_dim = "system"
262
  metric = "WER"
263
- st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
264
- fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
265
  st.pyplot(fig, clear_figure=True, use_container_width=True)
266
 
267
-
268
- ########### EVALUATION PARAMETERS PRESENTATION ################
269
- st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
270
- st.markdown(BIGOS_INFO, unsafe_allow_html=True)
271
  st.markdown("**Evaluation date:** {}".format(eval_date))
272
  st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
273
  st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
@@ -301,7 +330,6 @@ with lead_bigos:
301
  h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
302
  st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
303
 
304
-
305
  ##################### PER SUBSET ANALYSIS #########################
306
  analysis_dim = "subset"
307
  metric = "WER"
@@ -311,7 +339,7 @@ with lead_bigos:
311
  st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
312
 
313
  st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
314
- fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]")
315
  st.pyplot(fig, clear_figure=True, use_container_width=True)
316
 
317
  ### IMPACT OF NORMALIZATION ON ERROR RATES #####
@@ -395,16 +423,14 @@ with lead_pelcra:
395
 
396
  df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
397
 
398
- # MOST IMPORTANT RESULTS
399
  analysis_dim = "system"
400
  metric = "WER"
401
- st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
402
  fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
403
  st.pyplot(fig, clear_figure=True, use_container_width=True)
404
-
405
- ########### EVALUATION PARAMETERS PRESENTATION ################
406
- st.title("Leaderboard for {} {}".format(dataset_short_name, dataset_version))
407
- st.markdown(BIGOS_INFO, unsafe_allow_html=True)
408
  st.markdown("**Evaluation date:** {}".format(eval_date))
409
  st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
410
  st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
@@ -447,7 +473,7 @@ with lead_pelcra:
447
  st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
448
 
449
  st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
450
- fig = box_plot_per_dimension(df_per_dataset, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]")
451
  st.pyplot(fig, clear_figure=True, use_container_width=True)
452
 
453
  ### IMPACT OF NORMALIZATION ON ERROR RATES #####
@@ -502,6 +528,13 @@ with analysis:
502
 
503
  dataset = st.selectbox("Select Dataset", datasets, index=datasets.index('amu-cai/pl-asr-bigos-v2-secret'), key="select_dataset_scenarios")
504
 
 
 
 
 
 
 
 
505
  # read the latest results for the selected dataset
506
  print("Reading the latest results for dataset: ", dataset)
507
  df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
@@ -547,7 +580,7 @@ with analysis:
547
  st.subheader("Best and worst systems for dataset {}".format(dataset))
548
  df_best_worse_systems = pd.DataFrame(data, columns=header)
549
  # do not display index
550
- st.dataframe(df_best_worse_systems)
551
 
552
  st.subheader("Comparison of average WER for best systems")
553
  df_per_dataset_best_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_best_wer, commercial_system_with_best_wer])]
@@ -602,21 +635,74 @@ with analysis:
602
  # Y is the average WER
603
  # make each point a different color
604
  # provide legend with system names
605
- fig, ax = plt.subplots()
 
 
 
 
 
 
 
 
 
606
  for system in free_systems_wer['system'].unique():
607
  subset = free_systems_wer[free_systems_wer['system'] == system]
608
- ax.scatter(subset['Parameters [M]'], subset['WER'], label=system)
609
- # Add text annotation for each point
 
 
 
 
 
 
 
 
610
  for i, point in subset.iterrows():
611
- ax.annotate(point['system'], (point['Parameters [M]'], point['WER']), textcoords="offset points", xytext=(-10,-10), ha='left', rotation=-30, fontsize=5)
612
- ax.set_xlabel('Model Size [M]')
613
- ax.set_ylabel('WER (%)')
614
- ax.set_title('WER in function of model size')
615
- # decrease font size of the legend and place it outside the plot
616
- ax.legend(title='System', bbox_to_anchor=(1.05, 1), loc='upper left')
617
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
618
  st.pyplot(fig)
619
 
 
620
  ##################################################################################################################################################
621
  # WER per audio duration
622
 
@@ -653,11 +739,7 @@ with analysis:
653
  # print dataframe in streamlit
654
  st.dataframe(df_per_sample_wer_audio_pivot)
655
 
656
- # plot scatter plot with values from df_per_sample_wer_pivot.
657
- # each system should have a different color
658
- # the size of the point should be proportional to the number of samples in the bucket
659
- # the x axis should be the audio duration bucket
660
- # the y axis should be the average WER
661
  fig, ax = plt.subplots()
662
  for system in selected_systems:
663
  subset = df_per_sample_wer_audio[df_per_sample_wer_audio['system'] == system]
@@ -678,7 +760,7 @@ with analysis:
678
  audio_feature_to_analyze = 'speech_rate_words'
679
  audio_feature_unit = ' [words/s]'
680
  metric = 'WER'
681
- metric_unit = ' [%]'
682
  no_of_buckets = 10
683
  # calculate average WER per audio duration bucket for the best and worse commercial and free systems
684
  selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]
@@ -688,24 +770,57 @@ with analysis:
688
  # print dataframe in streamlit
689
  st.dataframe(df_per_sample_wer_feature_pivot)
690
 
691
- # plot scatter plot with values from df_per_sample_wer_pivot.
692
- # each system should have a different color
693
- # the size of the point should be proportional to the number of samples in the bucket
694
- # the x axis should be the audio duration bucket
695
- # the y axis should be the average WER
696
- fig, ax = plt.subplots()
697
- for system in selected_systems:
698
- subset = df_per_sample_wer_feature[df_per_sample_wer_feature['system'] == system]
699
- ax.scatter(subset[audio_feature_to_analyze], subset[metric], label=system, s=subset['number_of_samples']*0.5)
700
- ax.set_xlabel(audio_feature_to_analyze.replace('_',' ').capitalize() + audio_feature_unit)
701
- ax.set_ylabel(metric + metric_unit)
702
- ax.set_title('WER in function of speech rate.'.format(audio_feature_to_analyze))
703
 
704
- # place legend outside the plot on the right
705
- ax.legend(title='System', loc='best')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
706
  st.pyplot(fig)
707
 
708
 
 
709
  ################################################################################################################################################
710
  # WER PER GENDER
711
 
 
2
  import streamlit as st
3
  import pandas as pd
4
  from constants import BIGOS_INFO, PELCRA_INFO, ANALYSIS_INFO, ABOUT_INFO, INSPECTION_INFO, COMPARISON_INFO
5
+ from utils import read_latest_results, basic_stats_per_dimension, retrieve_asr_systems_meta_from_the_catalog, box_plot_per_dimension,box_plot_per_dimension_subsets, box_plot_per_dimension_with_colors, get_total_audio_duration, check_impact_of_normalization, calculate_wer_per_meta_category, calculate_wer_per_audio_feature
6
  from app_utils import calculate_height_to_display, filter_dataframe
7
  import matplotlib.pyplot as plt
8
  import numpy as np
9
+ import statsmodels.api as sm
10
+ import seaborn as sns
11
 
12
  hf_token = os.getenv('HF_TOKEN')
13
  if hf_token is None:
 
187
  st.pyplot(fig)
188
 
189
  with about:
190
+ st.title("AMU Polish ASR Leaderboard")
191
  st.markdown(ABOUT_INFO, unsafe_allow_html=True)
192
 
193
  # Table - evaluated systems # TODO - change to concatenated table
 
198
  #print("ASR systems available in the eval results for dataset {}: ".format(dataset), evaluated_systems_list )
199
 
200
  df_evaluated_systems = retrieve_asr_systems_meta_from_the_catalog(evaluated_systems_list)
201
+ # drop columns "Included in BIGOS benchmark"
202
+ df_evaluated_systems = df_evaluated_systems.drop(columns=["Included in BIGOS benchmark"])
203
+ # drop empty rows
204
+ df_evaluated_systems = df_evaluated_systems.dropna(how='all')
205
+ # drop empty columns
206
+ df_evaluated_systems = df_evaluated_systems.dropna(axis=1, how='all')
207
+
208
  codename_to_shortname_mapping = dict(zip(df_evaluated_systems["Codename"],df_evaluated_systems["Shortname"]))
209
  #print(codename_to_shortname_mapping)
210
 
 
212
 
213
  df_evaluated_systems_types_and_count = df_evaluated_systems["Type"].value_counts().reset_index()
214
  df_evaluated_systems_types_and_count.columns = ["Type", "Count"]
215
+ st.subheader("Evaluated systems:")
216
 
217
  st.dataframe(df_evaluated_systems_types_and_count, hide_index=True, use_container_width=False)
218
 
 
 
219
  #TODO - add info who created the system (company, institution, team, etc.)
220
+ # Split into separate tables for free and commercial systems
221
+ free_systems = df_evaluated_systems[df_evaluated_systems['Type'] == 'free']
222
+ commercial_systems = df_evaluated_systems[df_evaluated_systems['Type'] == 'commercial']
223
+
224
+ st.subheader("Free systems:")
225
+ # drop empty columns
226
+ free_systems = free_systems.dropna(axis=1, how='all')
227
+ # drop empty rows
228
+ free_systems = free_systems.dropna(how='all')
229
+
230
+ # do not display index
231
+ st.dataframe(free_systems, hide_index=True, height = h_df_systems, use_container_width=True)
232
+
233
+ st.subheader("Commercial systems:")
234
+ # drop empty columns
235
+ commercial_systems = commercial_systems.dropna(axis=1, how='all')
236
+ # do not display index
237
+ # drop empty rows
238
+ commercial_systems = commercial_systems.dropna(how='all')
239
+
240
+ st.dataframe(commercial_systems, hide_index=True, height = h_df_systems, use_container_width=True)
241
 
242
  # Table - evaluation datasets
243
  # Table - evaluation metrics
 
250
  # List - TODOs
251
 
252
  with lead_bigos:
253
+ st.title("BIGOS Leaderboard")
254
+ st.markdown(BIGOS_INFO, unsafe_allow_html=True)
255
 
256
  # configuration for tab
257
  dataset = "amu-cai/pl-asr-bigos-v2-secret"
 
286
  # save sample to tsv
287
  df_per_dataset_with_asr_systems_meta.sample(5).to_csv("sample.tsv", sep="\t", index=False)
288
 
289
+ ########### EVALUATION PARAMETERS PRESENTATION ################
290
+ st.title("ASR leaderboard for dataset: {} {}".format(dataset_short_name, dataset_version))
291
+
292
  # MOST IMPORTANT RESULTS
293
  analysis_dim = "system"
294
  metric = "WER"
295
+ st.subheader("Leaderboard - Median {} per ASR {} across all subsets of {} dataset".format(metric, analysis_dim, dataset_short_name))
296
+ fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim, metric + "[%]","System", "Type")
297
  st.pyplot(fig, clear_figure=True, use_container_width=True)
298
 
299
+ st.header("Benchmark details")
 
 
 
300
  st.markdown("**Evaluation date:** {}".format(eval_date))
301
  st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
302
  st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
 
330
  h_df_per_system_per_dataset = calculate_height_to_display(df_wer_per_system_from_per_dataset)
331
  st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
332
 
 
333
  ##################### PER SUBSET ANALYSIS #########################
334
  analysis_dim = "subset"
335
  metric = "WER"
 
339
  st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
340
 
341
  st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
342
+ fig = box_plot_per_dimension_subsets(df_per_dataset, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim +' of dataset ' + dataset_short_name , metric + " (%)", "system")
343
  st.pyplot(fig, clear_figure=True, use_container_width=True)
344
 
345
  ### IMPACT OF NORMALIZATION ON ERROR RATES #####
 
423
 
424
  df_per_dataset_with_asr_systems_meta = pd.merge(df_per_dataset, df_evaluated_systems, how="left", left_on="system", right_on="Shortname")
425
 
426
+ # MOST IMPORTANT RESULTS
427
  analysis_dim = "system"
428
  metric = "WER"
429
+ st.subheader("Leaderboard - Median {} per ASR {} across all subsets of {} dataset".format(metric, analysis_dim, dataset_short_name))
430
  fig = box_plot_per_dimension_with_colors(df_per_dataset_with_asr_systems_meta, metric, analysis_dim, "{} per {}".format(metric, analysis_dim), analysis_dim, metric + "[%]","System", "Type")
431
  st.pyplot(fig, clear_figure=True, use_container_width=True)
432
+
433
+ st.header("Benchmark details")
 
 
434
  st.markdown("**Evaluation date:** {}".format(eval_date))
435
  st.markdown("**Number of evaluated system-model variants:** {}".format(no_of_evaluated_systems))
436
  st.markdown("**Number of evaluated subsets:** {}".format(no_of_eval_subsets))
 
473
  st.dataframe(df_wer_per_system_from_per_dataset, height = h_df_per_system_per_dataset )
474
 
475
  st.subheader("Boxplot showing {} per {} sorted by median values".format(metric, analysis_dim))
476
+ fig = box_plot_per_dimension_subsets(df_per_dataset, metric, analysis_dim, "{} per {} for dataset {}".format(metric, analysis_dim, dataset_short_name), analysis_dim +' of dataset ' + dataset_short_name , metric + " (%)", "system")
477
  st.pyplot(fig, clear_figure=True, use_container_width=True)
478
 
479
  ### IMPACT OF NORMALIZATION ON ERROR RATES #####
 
528
 
529
  dataset = st.selectbox("Select Dataset", datasets, index=datasets.index('amu-cai/pl-asr-bigos-v2-secret'), key="select_dataset_scenarios")
530
 
531
+ if dataset == "amu-cai/pl-asr-bigos-v2-secret":
532
+ dataset_short_name = "BIGOS"
533
+ elif dataset == "pelcra/pl-asr-pelcra-for-bigos-secret":
534
+ dataset_short_name = "PELCRA"
535
+ else:
536
+ dataset_short_name = "UNKNOWN"
537
+
538
  # read the latest results for the selected dataset
539
  print("Reading the latest results for dataset: ", dataset)
540
  df_per_sample_all, df_per_dataset_all = read_latest_results(dataset, split, codename_to_shortname_mapping)
 
580
  st.subheader("Best and worst systems for dataset {}".format(dataset))
581
  df_best_worse_systems = pd.DataFrame(data, columns=header)
582
  # do not display index
583
+ st.dataframe(df_best_worse_systems, hide_index=True)
584
 
585
  st.subheader("Comparison of average WER for best systems")
586
  df_per_dataset_best_systems = df_per_dataset_with_asr_systems_meta[df_per_dataset_with_asr_systems_meta['system'].isin([free_system_with_best_wer, commercial_system_with_best_wer])]
 
635
  # Y is the average WER
636
  # make each point a different color
637
  # provide legend with system names
638
+ fig, ax = plt.subplots(figsize=(10, 7))
639
+
640
+ # Define larger jitter for close points
641
+ jitter_x = 5
642
+ jitter_y = 0.2
643
+
644
+ # Alternate marker shapes to distinguish overlapping points
645
+ marker_styles = ['o', 's', 'D', '^', 'v', '<', '>'] # Circle, square, diamond, and other shapes
646
+ marker_dict = {system: marker_styles[i % len(marker_styles)] for i, system in enumerate(free_systems_wer['system'].unique())}
647
+
648
  for system in free_systems_wer['system'].unique():
649
  subset = free_systems_wer[free_systems_wer['system'] == system]
650
+ marker_style = marker_dict[system]
651
+
652
+ # Scatter plot with distinct marker shapes for each system
653
+ ax.scatter(
654
+ subset['Parameters [M]'] + jitter_x * (np.random.rand(len(subset)) - 0.5), # Apply jitter to x for overlap
655
+ subset['WER'] + jitter_y * (np.random.rand(len(subset)) - 0.5), # Apply jitter to y for overlap
656
+ label=system, s=100, alpha=0.7, edgecolor='black', marker=marker_style
657
+ )
658
+
659
+ # Add text annotations with dynamic positioning to avoid overlap with y-axis
660
  for i, point in subset.iterrows():
661
+ # Adjust position to avoid overlap with y-axis
662
+ x_offset = 10 if point['Parameters [M]'] < 50 else -10 if i % 2 == 1 else 10 # Push right if close to y-axis
663
+ y_offset = -0.5 if i % 2 == 0 else 0.5 # Alternate vertical offset
664
+
665
+ ax.annotate(
666
+ point['system'],
667
+ (point['Parameters [M]'], point['WER']),
668
+ textcoords="offset points",
669
+ xytext=(x_offset, y_offset),
670
+ ha='right' if x_offset < 0 else 'left',
671
+ fontsize=10,
672
+ bbox=dict(boxstyle="round,pad=0.3", edgecolor='white', facecolor='white', alpha=0.7)
673
+ )
674
+
675
+ # Set axis labels and title
676
+ ax.set_xlabel('Model Size [M Parameters]', fontsize=12)
677
+ ax.set_ylabel('WER (%)', fontsize=12)
678
+ ax.set_title(f'WER vs. Model Size for Dataset {dataset_short_name}', fontsize=14, pad=20)
679
+
680
+ # Adjust legend settings to fit outside the main plot area
681
+ ax.legend(
682
+ title='System', bbox_to_anchor=(0.8, 1), loc='upper left',
683
+ fontsize=8, title_fontsize=9, frameon=True, shadow=False, facecolor='white')
684
+ #)
685
+
686
+ # Add grid lines and minor ticks for better readability
687
+ ax.grid(True, linestyle='--', alpha=0.5)
688
+ ax.minorticks_on()
689
+ ax.tick_params(which='both', direction='in', top=True, right=True)
690
+
691
+
692
+ # increase granularity of y-axis to 20 points per whole range
693
+ # Set y-axis limits: lower bound at 0, upper bound to next highest multiple of 5
694
+ y_min = 0
695
+ y_max = ax.get_ylim()[1] # Get the current maximum y value
696
+ y_max_rounded = np.ceil(y_max / 5) * 5 # Round y_max up to the next highest multiple of 5
697
+ ax.set_ylim(y_min, y_max_rounded)
698
+
699
+ # Improve layout spacing
700
+ plt.tight_layout()
701
+
702
+ # Display the plot
703
  st.pyplot(fig)
704
 
705
+
706
  ##################################################################################################################################################
707
  # WER per audio duration
708
 
 
739
  # print dataframe in streamlit
740
  st.dataframe(df_per_sample_wer_audio_pivot)
741
 
742
+ # create scatter plot with WER in function of audio duration
 
 
 
 
743
  fig, ax = plt.subplots()
744
  for system in selected_systems:
745
  subset = df_per_sample_wer_audio[df_per_sample_wer_audio['system'] == system]
 
760
  audio_feature_to_analyze = 'speech_rate_words'
761
  audio_feature_unit = ' [words/s]'
762
  metric = 'WER'
763
+ metric_unit = ' (%)'
764
  no_of_buckets = 10
765
  # calculate average WER per audio duration bucket for the best and worse commercial and free systems
766
  selected_systems = [free_system_with_best_wer, commercial_system_with_best_wer]
 
770
  # print dataframe in streamlit
771
  st.dataframe(df_per_sample_wer_feature_pivot)
772
 
773
+ # Set a threshold to remove outliers - here we use the 97th percentile of WER
774
+ threshold = df_per_sample_wer_feature[metric].quantile(0.97)
 
 
 
 
 
 
 
 
 
 
775
 
776
+ # Remove data points with WER greater than the threshold
777
+ filtered_df = df_per_sample_wer_feature[df_per_sample_wer_feature[metric] <= threshold]
778
+
779
+ # Create figure and axis with larger size
780
+ fig, ax = plt.subplots(figsize=(10, 7))
781
+
782
+ # Scatter plot for each system
783
+ for system in selected_systems:
784
+ subset = filtered_df[filtered_df['system'] == system]
785
+ ax.scatter(subset[audio_feature_to_analyze],
786
+ subset[metric],
787
+ label=system,
788
+ s=subset['number_of_samples'] * 0.5,
789
+ alpha=0.6) # Set alpha for better visibility of overlapping points
790
+
791
+ # Adding a trend line using LOWESS
792
+ lowess = sm.nonparametric.lowess
793
+ trend = lowess(subset[metric], subset[audio_feature_to_analyze], frac=0.3) # Adjust frac to control smoothing
794
+ ax.plot(trend[:, 0], trend[:, 1], label=f'{system} Trend', linestyle='-', linewidth=2)
795
+
796
+ # Set axis labels with improved formatting for readability
797
+ ax.set_xlabel(audio_feature_to_analyze.replace('_', ' ').capitalize() + ' ' + audio_feature_unit )
798
+ ax.set_ylabel(metric + ' ' + metric_unit )
799
+
800
+ # Set an improved title that is more informative
801
+ ax.set_title('Word Error Rate (WER) vs Speech Rate\nBest Performing Free and Paid Systems', fontsize=14)
802
+
803
+ # increase granularity of y-axis to 20 points per whole range
804
+ # Set y-axis limits: lower bound at 0, upper bound to next highest multiple of 5
805
+ y_min = 0
806
+ y_max = ax.get_ylim()[1] # Get the current maximum y value
807
+ y_max_rounded = np.ceil(y_max / 5) * 5 # Round y_max up to the next highest multiple of 5
808
+ ax.set_ylim(y_min, y_max_rounded)
809
+
810
+ # Add a grid to improve readability and alignment
811
+ ax.grid(True, linestyle='--', alpha=0.7)
812
+
813
+ # Place legend outside the plot area to prevent overlapping with data points
814
+ ax.legend(title='System', loc='upper right', bbox_to_anchor=(0.95, 1))
815
+
816
+ # Add tight layout to improve spacing between elements
817
+ fig.tight_layout()
818
+
819
+ # Display the plot
820
  st.pyplot(fig)
821
 
822
 
823
+
824
  ################################################################################################################################################
825
  # WER PER GENDER
826
 
constants.py CHANGED
@@ -1,7 +1,8 @@
1
  ABOUT_INFO = "Polish ASR leaderboard by [AMU-CAI team](https://huggingface.co/amu-cai) aims to provide comprehensive overview of performance of ASR/STT systems for Polish. <br>\
2
  The leaderboard currently supports [BIGOS V2](https://huggingface.co/datasets/amu-cai/pl-asr-bigos-v2) and [PELCRA for BIGOS](https://huggingface.co/datasets/pelcra/pl-asr-pelcra-for-bigos) datasets.<br>\
 
3
  To learn more please read blog post [here](https://huggingface.co/blog/michaljunczyk/introducing-polish-asr-leaderboard).<br> \
4
- If you use this work, please use the citation below: <br> \
5
  ```@misc{amu_cai_pl_asr_leaderboard, \
6
  author = {Michał Junczyk}, \
7
  title = {{AMU Polish ASR Leaderboard}}, \
 
1
  ABOUT_INFO = "Polish ASR leaderboard by [AMU-CAI team](https://huggingface.co/amu-cai) aims to provide comprehensive overview of performance of ASR/STT systems for Polish. <br>\
2
  The leaderboard currently supports [BIGOS V2](https://huggingface.co/datasets/amu-cai/pl-asr-bigos-v2) and [PELCRA for BIGOS](https://huggingface.co/datasets/pelcra/pl-asr-pelcra-for-bigos) datasets.<br>\
3
+ If you want to add your system or dataset to the leaderboard, please contact Michał Junczyk (michal.junczyk@amu.edu.pl) or open a pull request on [GitHub](https://github.com/goodmike31/pl-asr-bigos-tools) <br>\
4
  To learn more please read blog post [here](https://huggingface.co/blog/michaljunczyk/introducing-polish-asr-leaderboard).<br> \
5
+ If you use this work, please cite it as follows: <br> \
6
  ```@misc{amu_cai_pl_asr_leaderboard, \
7
  author = {Michał Junczyk}, \
8
  title = {{AMU Polish ASR Leaderboard}}, \
requirements.txt CHANGED
@@ -1,2 +1,2 @@
1
  seaborn
2
-
 
1
  seaborn
2
+ statsmodels
utils.py CHANGED
@@ -9,22 +9,10 @@ from datasets import Dataset
9
  from huggingface_hub import hf_hub_download
10
  import matplotlib.patches as mpatches
11
  import matplotlib as mpl
 
 
12
 
13
 
14
- asr_systems_colors_mapping = {
15
- 'azure': '#1f77b4', # Blue
16
- 'google': '#2ca02c', # Green
17
- 'wav2vec2': '#d62728', # Red
18
- 'nemo': '#9467bd', # Purple
19
- 'assemblyai': '#8c564b', # Brown
20
- 'mms': '#e377c2', # Pink
21
- 'google_v2': '#7f7f7f', # Gray
22
- 'whisper_cloud': '#bcbd22', # Olive
23
- 'whisper_local': '#ff7f0e', # Orange
24
-
25
- # Add or override other systems and their colors
26
- }
27
-
28
  def download_tsv_from_google_sheet(sheet_url):
29
  # Modify the Google Sheet URL to export it as TSV
30
  tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')
@@ -181,7 +169,7 @@ def filter_bottom_outliers(df_input, metric, min_threshold):
181
 
182
  def box_plot_per_dimension(df_input, metric, dimension, title, xlabel, ylabel):
183
  # Box plot for WER per dataset
184
- fig, ax = plt.subplots(figsize=(20, 10))
185
 
186
  # generate box plot without outliers
187
  sns.boxplot(x=dimension, y=metric, data=df_input, order=df_input.groupby(dimension)[metric].median().sort_values().index, showfliers=False)
@@ -193,6 +181,90 @@ def box_plot_per_dimension(df_input, metric, dimension, title, xlabel, ylabel):
193
  #return figure
194
  return plt
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  def box_plot_per_dimension_with_colors(df_input, metric, dimension, title, xlabel, ylabel, system_col, type_col):
197
  # Create a figure and axis object
198
  fig, ax = plt.subplots(figsize=(12, 8))
 
9
  from huggingface_hub import hf_hub_download
10
  import matplotlib.patches as mpatches
11
  import matplotlib as mpl
12
+ from constants import asr_systems_colors_mapping
13
+ from matplotlib.lines import Line2D
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  def download_tsv_from_google_sheet(sheet_url):
17
  # Modify the Google Sheet URL to export it as TSV
18
  tsv_url = sheet_url.replace('/edit#gid=', '/export?format=tsv&gid=')
 
169
 
170
  def box_plot_per_dimension(df_input, metric, dimension, title, xlabel, ylabel):
171
  # Box plot for WER per dataset
172
+ fig, ax = plt.subplots(figsize=(12, 8))
173
 
174
  # generate box plot without outliers
175
  sns.boxplot(x=dimension, y=metric, data=df_input, order=df_input.groupby(dimension)[metric].median().sort_values().index, showfliers=False)
 
181
  #return figure
182
  return plt
183
 
184
def box_plot_per_dimension_subsets(df_input, metric, dimension, title, xlabel, ylabel, category_column, y_limit=100):
    """
    Plots a box plot with individual data points colored and marked by a specified category.

    Boxes are ordered along the x-axis by the ascending median of `metric`
    within each `dimension` value. Individual rows are overlaid as a strip
    plot, one color/marker combination per distinct `category_column` value.

    Parameters:
    - df_input (pd.DataFrame): Input DataFrame containing data to plot.
    - metric (str): Column name for the metric to plot on the y-axis.
    - dimension (str): Column name for the dimension (x-axis categories).
    - title (str): Title of the plot.
    - xlabel (str): Label for the x-axis.
    - ylabel (str): Label for the y-axis.
    - category_column (str): Column name to use for differentiating data points by color and marker.
    - y_limit (float, optional): Rows whose metric exceeds this value are not drawn as
      individual points (they still contribute to the box statistics).

    Returns:
    - fig: The matplotlib figure object.
    """
    # Local import: only needed for the NaN guard on the y-axis maximum below.
    import math

    # Set up the figure and axis with a larger size for readability
    fig, ax = plt.subplots(figsize=(14, 8))

    # Create a sorted order for the dimension based on the median values of the metric
    order = df_input.groupby(dimension)[metric].median().sort_values().index

    # Generate box plot without showing extreme outliers
    sns.boxplot(
        x=dimension, y=metric, data=df_input,
        order=order, showfliers=False, width=0.6, ax=ax,
        color="white"
    )

    # Make the box faces see-through so the overlaid points and grid stay visible.
    # Depending on the matplotlib version the box patches are exposed through
    # `ax.artists` (older releases) or `ax.patches` (3.5 and newer); the original
    # loop over `.artists` alone silently did nothing on newer versions.
    # Only the face gets an alpha channel, the box outlines stay fully opaque.
    for patch in list(ax.artists) + list(ax.patches):
        patch.set_facecolor((1.0, 1.0, 1.0, 0.2))

    # Define category-specific colors and marker styles
    categories = df_input[category_column].unique()
    markers = ['o', 's', '^', 'D', 'X', 'P', '*']  # Different marker styles
    colors = sns.color_palette("Set2", len(categories))  # Use a color palette with distinct colors
    category_style_map = {
        category: {'color': colors[i % len(colors)], 'marker': markers[i % len(markers)]}
        for i, category in enumerate(categories)
    }

    # Overlay individual data points with category-specific colors and markers
    for category, style in category_style_map.items():
        # Filter data for each category, dropping points above y_limit
        category_data = df_input[(df_input[category_column] == category) & (df_input[metric] <= y_limit)]
        if category_data.empty:
            # Nothing to draw for this category (all of its rows exceed y_limit)
            continue
        sns.stripplot(
            x=dimension, y=metric, data=category_data,
            order=order, color=style['color'], marker=style['marker'],
            size=5, jitter=True, alpha=1, ax=ax
        )

    # Set title and axis labels
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    # Rotate the existing tick labels in place instead of re-setting them,
    # which avoids matplotlib's "FixedFormatter without FixedLocator" warning.
    for tick_label in ax.get_xticklabels():
        tick_label.set_rotation(45)
        tick_label.set_ha('right')

    # Add gridlines for easier comparison (on this axes, not pyplot's "current" one)
    ax.grid(axis='y', linestyle='--', alpha=0.5)

    # Set y-axis limit to improve readability.
    # The upper bound is the next multiple of 5 above the 99th percentile of the
    # metric, so that a few extreme outliers do not stretch the axis.
    max_value = df_input[metric].quantile(0.99)
    if math.isnan(max_value):
        # Empty or all-NaN metric column: fall back to the smallest axis range
        max_value = 0.0

    y_max = (int(max_value / 5) + 1) * 5

    # Set y-axis ticks with evenly spaced intervals of 5
    ax.set_yticks(range(0, y_max + 1, 5))
    ax.set_ylim(0, y_max)

    # Create a custom legend with unique entries for each category
    legend_handles = [
        Line2D([0], [0], marker=style['marker'], color='w', markerfacecolor=style['color'], markersize=8, label=category)
        for category, style in category_style_map.items()
    ]
    ax.legend(handles=legend_handles, title=category_column, bbox_to_anchor=(1.05, 1), loc='upper left')

    # Return the updated figure
    return fig
266
+
267
+
268
  def box_plot_per_dimension_with_colors(df_input, metric, dimension, title, xlabel, ylabel, system_col, type_col):
269
  # Create a figure and axis object
270
  fig, ax = plt.subplots(figsize=(12, 8))