Spaces:

g8a9
/

fair-asr-leaderboard

Sleeping

App Files Files Community

g8a9 committed on Dec 13, 2024

Commit

ad108b7

1 Parent(s): 0542773

add minimal structure and parsing cv17 results

Browse files

Files changed (5) hide show

.gitignore +2 -0
app.py +106 -69
config.py +88 -0
parsing.py +56 -0
requirements.txt +2 -1

.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ fair-asr-results
2	+ __pycache__

app.py CHANGED Viewed

@@ -2,93 +2,130 @@ import gradio as gr
 import pandas as pd
 import random
 import plotly.express as px
-def greet(name):
-    return "Hello " + name + "!"
-def get_results_df():
-    data = {
-        "Model": ["Model A", "Model B", "Model C"],
-        "Avg": [0.85, 0.90, 0.88],
-        "Gap Read": [0.05, 0.03, 0.04],
-        "Gap Spontaneous": [0.07, 0.06, 0.05],
-    }
-    df = pd.DataFrame(data)
-    return df
-def get_language_performance():
-    languages = [
-        "en",
-        "es",
-        "de",
-        "fr",
-        "it",
-        "pt",
-        "nl",
-        "ru",
-        "zh",
-        "ja",
-        "ko",
-        "ar",
-        "hi",
-        "bn",
-        "ur",
-        "tr",
-        "sv",
-    ]
-    data = {
-        "Model": ["Model A", "Model B", "Model C"],
-    }
-    for lang in languages:
-        data[lang] = [random.uniform(-100, 100) for _ in range(3)]
-    df = pd.DataFrame(data)
     return df
-results = get_results_df()
 with gr.Blocks() as fm_interface:
-    gr.DataFrame(results)
-    language_performance = get_language_performance()
-    print(language_performance)
-    fig1 = px.bar(
-        language_performance.melt(
-            id_vars="Model", var_name="Language", value_name="Performance"
-        ),
-        x="Language",
-        y="Performance",
-        color="Model",
-        title="Language Performance Plot 1",
-        barmode="group",
     )
-    fig2 = px.bar(
-        language_performance.melt(
-            id_vars="Model", var_name="Language", value_name="Performance"
-        ),
         x="Language",
-        y="Performance",
         color="Model",
-        title="Language Performance Plot 2",
         barmode="group",
     )
-    gr.Plot(fig1)
-    gr.Plot(fig2)
 tabs = [fm_interface]
 titles = ["F-M Setup"]
 with gr.Blocks() as demo:
-    gr.Markdown("# Fair ASR Leadeboard")
     gr.TabbedInterface(tabs, titles)
 if __name__ == "__main__":
     demo.launch()

 import pandas as pd
 import random
 import plotly.express as px
+from huggingface_hub import snapshot_download
+import os
+import logging
+from config import (
+    SETUPS,
+    LOCAL_RESULTS_DIR,
+    CITATION_BUTTON_TEXT,
+    CITATION_BUTTON_LABEL,
+)
+from parsing import read_all_configs
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
+    handlers=[
+        # logging.FileHandler("app.log"),
+        logging.StreamHandler()
+    ],
+)
+logger = logging.getLogger(__name__)
+try:
+    print("Saving results locally at:", LOCAL_RESULTS_DIR)
+    snapshot_download(
+        repo_id="g8a9/fair-asr-results",
+        local_dir=LOCAL_RESULTS_DIR,
+        repo_type="dataset",
+        tqdm_class=None,
+        etag_timeout=30,
+        ignore_patterns=["*samples*", "*transcripts*"],
+        token=os.environ.get("TOKEN"),
+    )
+except Exception as e:
+    raise e
+def format_dataframe(df, times_100=False):
+    """Return a copy of ``df`` with its numeric cells rendered as strings.
+
+    Every cell that is an ``int`` or ``float`` instance is formatted; any
+    other cell (e.g. a model name) is passed through unchanged.
+    Note that ``bool`` is a subclass of ``int`` and is therefore formatted too.
+
+    With ``times_100=True`` the value is multiplied by 100 and shown with
+    three decimals and a trailing ``%``; otherwise it is shown with four
+    decimals.
+    """
+    if times_100:
+        df = df.map(lambda x: (f"{x * 100:.3f}%" if isinstance(x, (int, float)) else x))
+    else:
+        df = df.map(lambda x: (f"{x:.4f}" if isinstance(x, (int, float)) else x))
     return df
 with gr.Blocks() as fm_interface:
+    fm = SETUPS[0]
+    setup = fm["majority_group"] + "_" + fm["minority_group"]
+    results = read_all_configs(setup)
+    model_results = (
+        results.pivot_table(
+            index="Model", values="Gap", aggfunc=lambda x: 100 * x.abs().sum()
+        )
+        .reset_index()
+        .sort_values("Gap")
     )
+    best_model = model_results.iloc[0]["Model"]
+    print("Best model:", best_model)
+    # model_results = format_dataframe(model_results)
+    # print(results.head())
+    gr.Markdown("### Sum of Absolute Gaps ⬇️")
+    gr.DataFrame(format_dataframe(model_results))
+    gr.Markdown("#### F-M gaps by language")
+    lang_results = results.pivot_table(
+        index="Model",
+        values="Gap",
+        columns="Language",
+    ).reset_index()
+    gr.DataFrame(format_dataframe(lang_results, times_100=True))
+    # gr.Plot(fig1)
+    results["Gap"] = results["Gap"] * 100
+    fig = px.bar(
+        results,
         x="Language",
+        y="Gap",
         color="Model",
+        title="Gaps by Language and Model",
+        labels={
+            "Gap": "Sum of Absolute Gaps (%)",
+            "Language": "Language",
+            "Model": "Model",
+        },
         barmode="group",
     )
+    lang_order = (
+        lang_results.set_index("Model")
+        .loc[best_model]
+        .sort_values(ascending=False)
+        .index
+    )
+    print(lang_order)
+    # [best_model].sort_values().index
+    fig.update_layout(xaxis={"categoryorder": "array", "categoryarray": lang_order})
+    gr.Plot(fig)
+    # gr.Plot(fig2)
 tabs = [fm_interface]
 titles = ["F-M Setup"]
 with gr.Blocks() as demo:
+    gr.Markdown("# Twists, Humps, and Pebbles: ASR Leadeboard")
+    gr.Markdown(
+        """
+Datasets currently included:
+- **Mozilla Common Voice v17**
+"""
+    )
     gr.TabbedInterface(tabs, titles)
+    gr.Textbox(
+        value=CITATION_BUTTON_TEXT,
+        label=CITATION_BUTTON_LABEL,
+        max_lines=6,
+        show_copy_button=True,
+    )
 if __name__ == "__main__":
     demo.launch()

config.py ADDED Viewed

	@@ -0,0 +1,88 @@

+"""
+Python file to store configuration and info, e.g., which languages
+to use for a particular dataset, or which languages a model should be
+evaluated on.
+"""
+# Local directory the results dataset is downloaded into (app.py) and read
+# back from (parsing.py).
+LOCAL_RESULTS_DIR = "fair-asr-results"
+# Group pairs to compare. Each entry names a majority and a minority group;
+# app.py joins the two names with "_" to build the setup string.
+SETUPS = [{"majority_group": "male_masculine", "minority_group": "female_feminine"}]
+class CVInfo:
+    """Static info for the Mozilla Common Voice v17 dataset."""
+    # Identifier of the dataset; the same string is the key in dataset2info.
+    dataset_id: str = "cv_17"
+    # Human-readable dataset name.
+    full_name: str = "Mozilla Common Voice v17"
+    # Language codes for this dataset, one line per language family
+    # (the family is named in the trailing comment).
+    # fmt: off
+    langs = [
+        "de", "en", "nl",  # Germanic
+        "ru", "sr", "cs", "sk",  # Slavic
+        "it", "fr", "es", "ca", "pt", "ro",  # Romance
+        "sw",  # Bantu
+        "yo",  # Niger-Congo
+        "ja",  # Japonic
+        "hu", "fi",  # Uralic
+        "ar"  # Semitic
+    ]
+    # fmt: on
+# Maps a dataset id to its info class; parsing.py iterates the keys and reads
+# each class's `langs`.
+dataset2info = {"cv_17": CVInfo}
+class WhisperInfo:
+    """Static info shared by the Whisper entries of model2info."""
+    # Language codes, one line per language family (named in the trailing comment).
+    # fmt: off
+    langs = [
+        "de", "en", "nl",  # Germanic
+        "ru", "sr", "cs", "sk",  # Slavic
+        "it", "fr", "es", "ca", "pt", "ro",  # Romance
+        "sw",  # Bantu
+        "yo",  # Niger-Congo
+        "ja",  # Japonic
+        "hu", "fi",  # Uralic
+        "ar"  # Semitic
+    ]
+    # fmt: on
+class SeamlessInfo:
+    """Static info for the Seamless model (its model2info entry is currently commented out)."""
+    # Language codes, one line per language family (named in the trailing comment).
+    # fmt: off
+    langs = [
+        "de", "en", "nl",  # Germanic
+        "ru", "sr", "cs", "sk",  # Slavic
+        "it", "fr", "es", "ca", "pt", "ro",  # Romance
+        "sw",  # Bantu
+        "yo",  # Niger-Congo
+        "ja",  # Japonic
+        "hu", "fi",  # Uralic
+        "ar"  # Semitic
+    ]
+    # fmt: on
+# Maps a model id (as it appears in the results file names) to its info class.
+# parsing.py iterates the keys to decide which models to load.
+model2info = {
+    "openai--whisper-large-v3": WhisperInfo,
+    "openai--whisper-large-v3-turbo": WhisperInfo,
+    # "facebook--seamless-m4t-v2-large": SeamlessInfo,
+}
+# Label and content of the citation text box shown at the bottom of the app.
+CITATION_BUTTON_LABEL = "Please use this bibtex to cite these results"
+# BibTeX entry, kept as a raw string so it is shown verbatim.
+CITATION_BUTTON_TEXT = r"""@inproceedings{attanasio-etal-2024-twists,
+    title = "Twists, Humps, and Pebbles: Multilingual Speech Recognition Models Exhibit Gender Performance Gaps",
+    author = "Attanasio, Giuseppe  and
+      Savoldi, Beatrice  and
+      Fucci, Dennis  and
+      Hovy, Dirk",
+    editor = "Al-Onaizan, Yaser  and
+      Bansal, Mohit  and
+      Chen, Yun-Nung",
+    booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
+    month = nov,
+    year = "2024",
+    address = "Miami, Florida, USA",
+    publisher = "Association for Computational Linguistics",
+    url = "https://aclanthology.org/2024.emnlp-main.1188",
+    doi = "10.18653/v1/2024.emnlp-main.1188",
+    pages = "21318--21340",
+    abstract = "Current automatic speech recognition (ASR) models are designed to be used across many languages and tasks without substantial changes. However, this broad language coverage hides performance gaps within languages, for example, across genders. Our study systematically evaluates the performance of two widely used multilingual ASR models on three datasets, encompassing 19 languages from eight language families and two speaking conditions. Our findings reveal clear gender disparities, with the advantaged group varying across languages and models. Surprisingly, those gaps are not explained by acoustic or lexical properties. However, probing internal model states reveals a correlation with gendered performance gap. That is, the easier it is to distinguish speaker gender in a language using probes, the more the gap reduces, favoring female speakers. Our results show that gender disparities persist even in state-of-the-art models. Our findings have implications for the improvement of multilingual ASR systems, underscoring the importance of accessibility to training data and nuanced evaluation to predict and mitigate gender gaps. We release all code and artifacts at https://github.com/g8a9/multilingual-asr-gender-gap.",
+}"""

parsing.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import pandas as pd
+from typing import List
+from os.path import join as opj
+import json
+from config import dataset2info, model2info, LOCAL_RESULTS_DIR
+def load_language_results(
+    model_id: str, dataset_id: str, lang_ids: List[str], setup: str
+):
+    lang_gaps = dict()
+    for lang in lang_ids:
+        with open(
+            opj(
+                LOCAL_RESULTS_DIR,
+                "evaluation",
+                dataset_id,
+                f"results_{model_id}_{dataset_id}_devtest_{lang}_gender_{setup}.json",
+            )
+        ) as fp:
+            data = json.load(fp)
+            lang_gaps[lang] = data[f"{data['eval_metric']}_diff_mean"]
+    return lang_gaps
+def read_all_configs(setup: str):
+    all_datasets = dataset2info.keys()
+    print("Parsing results datasets:", all_datasets)
+    all_models = model2info.keys()
+    print("Parsing results models:", all_models)
+    rows = list()
+    for dataset_id in all_datasets:
+        for model_id in all_models:
+            lang_gaps = load_language_results(
+                model_id, dataset_id, dataset2info[dataset_id].langs, setup
+            )
+            rows.extend(
+                [
+                    {
+                        "Model": model_id,
+                        "Dataset": dataset_id,
+                        "Language": lang,
+                        "Gap": lang_gaps[lang],
+                    }
+                    for lang in lang_gaps
+                ]
+            )
+    results_df = pd.DataFrame(rows)
+    results_df = results_df.drop(columns=["Dataset"])
+    # results_df = results_df.sort_values(by="Mean Gap", ascending=True)
+    return results_df

requirements.txt CHANGED Viewed

@@ -1,2 +1,3 @@
 gradio
-plotly

 gradio
+plotly
+pandas