g8a9 committed on
Commit
ad108b7
1 Parent(s): 0542773

add minimal structure and parsing cv17 results

Browse files
Files changed (5) hide show
  1. .gitignore +2 -0
  2. app.py +106 -69
  3. config.py +88 -0
  4. parsing.py +56 -0
  5. requirements.txt +2 -1
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ fair-asr-results
2
+ __pycache__
app.py CHANGED
@@ -2,93 +2,130 @@ import gradio as gr
2
  import pandas as pd
3
  import random
4
  import plotly.express as px
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
 
7
- def greet(name):
8
- return "Hello " + name + "!"
9
-
10
-
11
- def get_results_df():
12
- data = {
13
- "Model": ["Model A", "Model B", "Model C"],
14
- "Avg": [0.85, 0.90, 0.88],
15
- "Gap Read": [0.05, 0.03, 0.04],
16
- "Gap Spontaneous": [0.07, 0.06, 0.05],
17
- }
18
-
19
- df = pd.DataFrame(data)
20
- return df
21
-
22
-
23
- def get_language_performance():
24
- languages = [
25
- "en",
26
- "es",
27
- "de",
28
- "fr",
29
- "it",
30
- "pt",
31
- "nl",
32
- "ru",
33
- "zh",
34
- "ja",
35
- "ko",
36
- "ar",
37
- "hi",
38
- "bn",
39
- "ur",
40
- "tr",
41
- "sv",
42
- ]
43
- data = {
44
- "Model": ["Model A", "Model B", "Model C"],
45
- }
46
-
47
- for lang in languages:
48
- data[lang] = [random.uniform(-100, 100) for _ in range(3)]
49
-
50
- df = pd.DataFrame(data)
51
  return df
52
 
53
 
54
- results = get_results_df()
55
-
56
  with gr.Blocks() as fm_interface:
57
- gr.DataFrame(results)
58
-
59
- language_performance = get_language_performance()
60
- print(language_performance)
61
-
62
- fig1 = px.bar(
63
- language_performance.melt(
64
- id_vars="Model", var_name="Language", value_name="Performance"
65
- ),
66
- x="Language",
67
- y="Performance",
68
- color="Model",
69
- title="Language Performance Plot 1",
70
- barmode="group",
71
  )
72
- fig2 = px.bar(
73
- language_performance.melt(
74
- id_vars="Model", var_name="Language", value_name="Performance"
75
- ),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  x="Language",
77
- y="Performance",
78
  color="Model",
79
- title="Language Performance Plot 2",
 
 
 
 
 
80
  barmode="group",
81
  )
 
 
 
 
 
 
 
82
 
83
- gr.Plot(fig1)
84
- gr.Plot(fig2)
 
 
85
 
86
  tabs = [fm_interface]
87
  titles = ["F-M Setup"]
88
 
89
  with gr.Blocks() as demo:
90
- gr.Markdown("# Fair ASR Leadeboard")
 
 
 
 
 
 
91
  gr.TabbedInterface(tabs, titles)
92
 
 
 
 
 
 
 
 
93
  if __name__ == "__main__":
94
  demo.launch()
 
2
  import pandas as pd
3
  import random
4
  import plotly.express as px
5
+ from huggingface_hub import snapshot_download
6
+ import os
7
+ import logging
8
+
9
+ from config import (
10
+ SETUPS,
11
+ LOCAL_RESULTS_DIR,
12
+ CITATION_BUTTON_TEXT,
13
+ CITATION_BUTTON_LABEL,
14
+ )
15
+ from parsing import read_all_configs
16
+
17
+ # Set up logging
18
+ logging.basicConfig(
19
+ level=logging.INFO,
20
+ format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
21
+ handlers=[
22
+ # logging.FileHandler("app.log"),
23
+ logging.StreamHandler()
24
+ ],
25
+ )
26
+
27
+ logger = logging.getLogger(__name__)
28
+
29
+
30
# Fetch the results dataset from the Hugging Face Hub at import time, before
# any interface below is built. Files matching the ignore_patterns globs are
# not downloaded. The access token is read from the TOKEN environment
# variable; os.environ.get yields None when it is unset.
try:
    print("Saving results locally at:", LOCAL_RESULTS_DIR)
    snapshot_download(
        repo_id="g8a9/fair-asr-results",
        local_dir=LOCAL_RESULTS_DIR,
        repo_type="dataset",
        tqdm_class=None,
        etag_timeout=30,
        ignore_patterns=["*samples*", "*transcripts*"],
        token=os.environ.get("TOKEN"),
    )
except Exception:
    # Record the failure (with traceback) through the logger configured
    # above, then re-raise the original exception unchanged: the app cannot
    # render anything without the results on disk.
    logger.exception("Could not download results into %s", LOCAL_RESULTS_DIR)
    raise
43
 
44
 
45
def format_dataframe(df, times_100=False):
    """Return a new DataFrame with every numeric cell rendered as a string.

    Args:
        df: DataFrame whose cells are formatted element-wise.
        times_100: When true, numbers are multiplied by 100 and rendered with
            three decimals and a trailing ``%``; otherwise they are rendered
            with four decimals.

    Returns:
        A DataFrame of the same shape. Cells that are not ``int``/``float``
        (for example model names) and boolean cells are passed through
        unchanged.
    """

    def _is_number(value):
        # bool is a subclass of int; a flag must not be rendered as "1.0000".
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    if times_100:

        def _render(value):
            return f"{value * 100:.3f}%" if _is_number(value) else value

    else:

        def _render(value):
            return f"{value:.4f}" if _is_number(value) else value

    # DataFrame.map only exists from pandas 2.1 on; older releases expose the
    # same element-wise operation as DataFrame.applymap. The check is done on
    # the class so that a column literally named "map" cannot shadow it.
    elementwise = df.map if hasattr(type(df), "map") else df.applymap
    return elementwise(_render)
51
 
52
 
 
 
53
  with gr.Blocks() as fm_interface:
54
+ fm = SETUPS[0]
55
+ setup = fm["majority_group"] + "_" + fm["minority_group"]
56
+ results = read_all_configs(setup)
57
+
58
+ model_results = (
59
+ results.pivot_table(
60
+ index="Model", values="Gap", aggfunc=lambda x: 100 * x.abs().sum()
61
+ )
62
+ .reset_index()
63
+ .sort_values("Gap")
 
 
 
 
64
  )
65
+ best_model = model_results.iloc[0]["Model"]
66
+ print("Best model:", best_model)
67
+ # model_results = format_dataframe(model_results)
68
+ # print(results.head())
69
+
70
+ gr.Markdown("### Sum of Absolute Gaps ⬇️")
71
+ gr.DataFrame(format_dataframe(model_results))
72
+
73
+ gr.Markdown("#### F-M gaps by language")
74
+
75
+ lang_results = results.pivot_table(
76
+ index="Model",
77
+ values="Gap",
78
+ columns="Language",
79
+ ).reset_index()
80
+ gr.DataFrame(format_dataframe(lang_results, times_100=True))
81
+
82
+ # gr.Plot(fig1)
83
+ results["Gap"] = results["Gap"] * 100
84
+ fig = px.bar(
85
+ results,
86
  x="Language",
87
+ y="Gap",
88
  color="Model",
89
+ title="Gaps by Language and Model",
90
+ labels={
91
+ "Gap": "Sum of Absolute Gaps (%)",
92
+ "Language": "Language",
93
+ "Model": "Model",
94
+ },
95
  barmode="group",
96
  )
97
+ lang_order = (
98
+ lang_results.set_index("Model")
99
+ .loc[best_model]
100
+ .sort_values(ascending=False)
101
+ .index
102
+ )
103
+ print(lang_order)
104
 
105
+ # [best_model].sort_values().index
106
+ fig.update_layout(xaxis={"categoryorder": "array", "categoryarray": lang_order})
107
+ gr.Plot(fig)
108
+ # gr.Plot(fig2)
109
 
110
  tabs = [fm_interface]
111
  titles = ["F-M Setup"]
112
 
113
  with gr.Blocks() as demo:
114
+ gr.Markdown("# Twists, Humps, and Pebbles: ASR Leadeboard")
115
+ gr.Markdown(
116
+ """
117
+ Datasets currently included:
118
+ - **Mozilla Common Voice v17**
119
+ """
120
+ )
121
  gr.TabbedInterface(tabs, titles)
122
 
123
+ gr.Textbox(
124
+ value=CITATION_BUTTON_TEXT,
125
+ label=CITATION_BUTTON_LABEL,
126
+ max_lines=6,
127
+ show_copy_button=True,
128
+ )
129
+
130
  if __name__ == "__main__":
131
  demo.launch()
config.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Python file to store configuration and info, e.g., which language
3
+ to use for a particular dataset, or which language a model should be
4
+ evaluated on.
5
+ """
6
+
7
+ LOCAL_RESULTS_DIR = "fair-asr-results"
8
+ SETUPS = [{"majority_group": "male_masculine", "minority_group": "female_feminine"}]
9
+
10
+
11
+ class CVInfo:
12
+ dataset_id: str = "cv_17"
13
+ full_name: str = "Mozilla Common Voice v17"
14
+
15
+ # fmt: off
16
+ langs = [
17
+ "de", "en", "nl", # Germanic
18
+ "ru", "sr", "cs", "sk", # Slavic
19
+ "it", "fr", "es", "ca", "pt", "ro", # Romance
20
+ "sw", # Bantu
21
+ "yo", # Niger-Congo
22
+ "ja", # Japonic
23
+ "hu", "fi", # Uralic
24
+ "ar" # Semitic
25
+ ]
26
+ # fmt: on
27
+
28
+
29
+ dataset2info = {"cv_17": CVInfo}
30
+
31
+
32
+ class WhisperInfo:
33
+ # fmt: off
34
+ langs = [
35
+ "de", "en", "nl", # Germanic
36
+ "ru", "sr", "cs", "sk", # Slavic
37
+ "it", "fr", "es", "ca", "pt", "ro", # Romance
38
+ "sw", # Bantu
39
+ "yo", # Niger-Congo
40
+ "ja", # Japonic
41
+ "hu", "fi", # Uralic
42
+ "ar" # Semitic
43
+ ]
44
+ # fmt: on
45
+
46
+
47
+ class SeamlessInfo:
48
+ # fmt: off
49
+ langs = [
50
+ "de", "en", "nl", # Germanic
51
+ "ru", "sr", "cs", "sk", # Slavic
52
+ "it", "fr", "es", "ca", "pt", "ro", # Romance
53
+ "sw", # Bantu
54
+ "yo", # Niger-Congo
55
+ "ja", # Japonic
56
+ "hu", "fi", # Uralic
57
+ "ar" # Semitic
58
+ ]
59
+ # fmt: on
60
+
61
+
62
+ model2info = {
63
+ "openai--whisper-large-v3": WhisperInfo,
64
+ "openai--whisper-large-v3-turbo": WhisperInfo,
65
+ # "facebook--seamless-m4t-v2-large": SeamlessInfo,
66
+ }
67
+
68
+
69
+ CITATION_BUTTON_LABEL = "Please use this bibtex to cite these results"
70
+ CITATION_BUTTON_TEXT = r"""@inproceedings{attanasio-etal-2024-twists,
71
+ title = "Twists, Humps, and Pebbles: Multilingual Speech Recognition Models Exhibit Gender Performance Gaps",
72
+ author = "Attanasio, Giuseppe and
73
+ Savoldi, Beatrice and
74
+ Fucci, Dennis and
75
+ Hovy, Dirk",
76
+ editor = "Al-Onaizan, Yaser and
77
+ Bansal, Mohit and
78
+ Chen, Yun-Nung",
79
+ booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
80
+ month = nov,
81
+ year = "2024",
82
+ address = "Miami, Florida, USA",
83
+ publisher = "Association for Computational Linguistics",
84
+ url = "https://aclanthology.org/2024.emnlp-main.1188",
85
+ doi = "10.18653/v1/2024.emnlp-main.1188",
86
+ pages = "21318--21340",
87
+ abstract = "Current automatic speech recognition (ASR) models are designed to be used across many languages and tasks without substantial changes. However, this broad language coverage hides performance gaps within languages, for example, across genders. Our study systematically evaluates the performance of two widely used multilingual ASR models on three datasets, encompassing 19 languages from eight language families and two speaking conditions. Our findings reveal clear gender disparities, with the advantaged group varying across languages and models. Surprisingly, those gaps are not explained by acoustic or lexical properties. However, probing internal model states reveals a correlation with gendered performance gap. That is, the easier it is to distinguish speaker gender in a language using probes, the more the gap reduces, favoring female speakers. Our results show that gender disparities persist even in state-of-the-art models. Our findings have implications for the improvement of multilingual ASR systems, underscoring the importance of accessibility to training data and nuanced evaluation to predict and mitigate gender gaps. We release all code and artifacts at https://github.com/g8a9/multilingual-asr-gender-gap.",
88
+ }"""
parsing.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from typing import List
3
+ from os.path import join as opj
4
+ import json
5
+ from config import dataset2info, model2info, LOCAL_RESULTS_DIR
6
+
7
+
8
def load_language_results(
    model_id: str, dataset_id: str, lang_ids: List[str], setup: str
) -> dict:
    """Read the per-language gap of one model on one dataset.

    For every language in ``lang_ids`` a JSON file named
    ``results_<model_id>_<dataset_id>_devtest_<lang>_gender_<setup>.json`` is
    read from ``LOCAL_RESULTS_DIR/evaluation/<dataset_id>/``. The value kept
    is the entry ``<eval_metric>_diff_mean``, where ``eval_metric`` is itself
    read from the same file.

    Args:
        model_id: Model identifier as it appears in the result file names.
        dataset_id: Dataset identifier; also the name of the sub-directory.
        lang_ids: Language codes to load.
        setup: Group-comparison suffix used in the result file names.

    Returns:
        Mapping from language code to the value found in that language's file.

    Raises:
        FileNotFoundError: If a result file does not exist.
        KeyError: If a file lacks ``eval_metric`` or the derived key.
    """
    # The directory does not depend on the language, so build it once.
    results_dir = opj(LOCAL_RESULTS_DIR, "evaluation", dataset_id)

    lang_gaps = {}
    for lang in lang_ids:
        filename = (
            f"results_{model_id}_{dataset_id}_devtest_{lang}_gender_{setup}.json"
        )
        # JSON is UTF-8 by specification; do not rely on the platform's
        # default text encoding.
        with open(opj(results_dir, filename), encoding="utf-8") as fp:
            data = json.load(fp)
        lang_gaps[lang] = data[f"{data['eval_metric']}_diff_mean"]
    return lang_gaps
24
+
25
+
26
def read_all_configs(setup: str):
    """Build one long-format table of gaps for every dataset/model pair.

    Iterates over all keys of ``dataset2info`` and ``model2info``, loads the
    per-language gaps with ``load_language_results`` and stacks them.

    Args:
        setup: Group-comparison suffix forwarded to ``load_language_results``.

    Returns:
        A DataFrame with columns ``Model``, ``Language`` and ``Gap``, one row
        per (dataset, model, language). The frame has these columns even when
        no rows were collected.
    """
    all_datasets = dataset2info.keys()
    print("Parsing results datasets:", all_datasets)
    all_models = model2info.keys()
    print("Parsing results models:", all_models)

    rows = []
    for dataset_id in all_datasets:
        dataset_langs = dataset2info[dataset_id].langs
        for model_id in all_models:
            lang_gaps = load_language_results(
                model_id, dataset_id, dataset_langs, setup
            )
            rows.extend(
                {
                    "Model": model_id,
                    "Dataset": dataset_id,
                    "Language": lang,
                    "Gap": gap,
                }
                for lang, gap in lang_gaps.items()
            )

    # Naming the columns explicitly keeps the frame well-formed when `rows`
    # is empty; otherwise dropping "Dataset" below would raise KeyError.
    results_df = pd.DataFrame(rows, columns=["Model", "Dataset", "Language", "Gap"])
    return results_df.drop(columns=["Dataset"])
requirements.txt CHANGED
@@ -1,2 +1,3 @@
1
  gradio
2
- plotly
 
 
1
  gradio
2
+ plotly
3
+ pandas