File size: 9,572 Bytes
ef32dc8
 
8be74ba
93010f6
8be74ba
 
 
168c0f0
 
8be74ba
168c0f0
 
8be74ba
 
bba4630
0f588e7
 
 
 
 
 
8be74ba
 
 
 
 
 
 
0f588e7
8be74ba
 
 
 
 
 
 
 
 
 
5769259
8be74ba
 
0f588e7
 
8be74ba
 
 
 
 
 
c71f75e
8be74ba
 
 
ffe2e9b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8be74ba
 
 
 
 
5769259
8be74ba
a4430c5
 
 
 
 
8be74ba
 
 
 
a4430c5
8be74ba
ef32dc8
 
 
8be74ba
 
 
 
 
 
 
 
ef32dc8
8be74ba
 
ef32dc8
8be74ba
ef32dc8
8be74ba
 
 
 
 
 
 
ef32dc8
 
 
8be74ba
 
 
ef32dc8
2f01202
ef32dc8
 
ec69747
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
import gradio as gr
import pandas as pd

# Banner image, loaded by URL from a Hugging Face Space and centered via flexbox.
banner_url = "https://huggingface.co/spaces/elmresearchcenter/open_universal_arabic_asr_leaderboard/resolve/main/banner.png"
BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 20vw; min-width: 300px; max-width: 600px;"> </div>'

# Markdown rendered directly under the banner.  The explicit "\n" escapes at
# the start of each line add a blank line so every sentence becomes its own
# Markdown paragraph.  The emoji and the em dash are stored as real Unicode
# characters (they were previously double-encoded and rendered as garbage).
INTRODUCTION_TEXT = """
📖**Universal Arabic ASR Leaderboard (Extended Edition)**📖 is a fork of the [Open Universal Arabic ASR Leaderboard](https://github.com/Natural-Language-Processing-Elm/open_universal_arabic_asr_leaderboard), extended to benchmark **both open-source and closed-source** multi-dialect Arabic ASR models on various multi-dialect datasets.
\nWhile the original leaderboard is dedicated exclusively to open-source models, this fork broadens the scope to include commercial/closed-source providers such as **ElevenLabs, OpenAI, Deepgram, Microsoft**, and others — enabling a more comprehensive comparison across the Arabic ASR landscape.
\nApart from the WER%/CER% for each test set, we also report the Average WER%/CER% and rank the models based on the Average WER, from lowest to highest.
\nFor more context on the motivation behind this fork, see the [original discussion](https://github.com/Natural-Language-Processing-Elm/open_universal_arabic_asr_leaderboard/issues/5).
\nFor more detailed analysis such as models' robustness, speaker adaption, model efficiency and memory usage, please check the original [paper](https://arxiv.org/pdf/2412.13788).
"""

# BibTeX entry offered in the "Citation" accordion.
CITATION_BUTTON_TEXT = """
@article{wang2024open,
  title={Open Universal Arabic ASR Leaderboard},
  author={Wang, Yingzhi and Alhmoud, Anas and Alqurishi, Muhammad},
  journal={arXiv preprint arXiv:2412.13788},
  year={2024}
}
"""

# Markdown body of the "Metrics" tab: metric definitions, reproduction notes
# and the table of benchmark test sets.
METRICS_TAB_TEXT = """
## Metrics
We report both the Word Error Rate (WER) and Character Error Rate (CER) metrics.
## Reproduction
The Open Universal Arabic ASR Leaderboard will be a continuous benchmark project. 
\nWe open-source the evaluation scripts at our [GitHub repo](https://github.com/Natural-Language-Processing-Elm/open_universal_arabic_asr_leaderboard).
\nPlease launch a discussion in our GitHub repo to let us know if you want to learn about the performance of a new model.

## Benchmark datasets
| Test Set                                                                                        | Num Dialects   | Test (h)    |
|-------------------------------------------------------------------------------------------------|----------------|-------------|
| [SADA](https://www.kaggle.com/datasets/sdaiancai/sada2022)                                      | 10             | 10.7        |
| [Common Voice 18.0](https://commonvoice.mozilla.org/en/datasets)                                | 25             | 12.6        |
| [MASC (Clean-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus)    | 7              | 10.5        |
| [MASC (Noisy-Test)](https://ieee-dataport.org/open-access/masc-massive-arabic-speech-corpus)    | 8              | 14.9        |
| [MGB-2](http://www.mgb-challenge.org/MGB-2.html)                                                | Unspecified    | 9.6         |
| [Casablanca](https://huggingface.co/datasets/UBC-NLP/Casablanca)                                | 8              | 7.7         |

## In-depth Analysis
We also provide a comprehensive analysis of the models' robustness, speaker adaptation, inference efficiency and memory consumption.
\nPlease check our [paper](https://arxiv.org/pdf/2412.13788) to learn more.
"""


def styled_message(message):
    """Wrap *message* in a centered, green, 20px HTML ``<p>`` element.

    The message is interpolated as-is (no HTML escaping is applied).
    """
    paragraph_style = "color: green; font-size: 20px; text-align: center;"
    return "<p style='{}'>{}</p>".format(paragraph_style, message)

# Free-text note rendered at the bottom of the page.
# NOTE(review): `results` below contains models that are not named in this
# note — confirm whether the date / model list needs refreshing.
LAST_UPDATED = "Apr 24th 2025:[New models included: cntxt-ai-munsit-1, elevenlabs-scribe-v1, microsoft-azure-stt, openai-gpt-4o-transcribe]"


# Column the leaderboard is ranked by (lower is better).
SORT_COLUMN = "Average WER⬇️"

# Leaderboard data in column-oriented form: every list is index-aligned with
# "Model", so entry i of each metric belongs to model i.  The "\n" inside some
# keys is part of the column header text.
results = {
    "Model": ["omnilingual-asr/omniASR_LLM_7B", "omnilingual-asr/omniASR_LLM_3B", "omnilingual-asr/omniASR_LLM_1B", "Qwen/Qwen3-Omni-30B-A3B-Instruct", "deepgram-nova-3", "cntxt-ai-munsit-1", "elevenlabs-scribe-v1", "microsoft-azure-stt", "openai-gpt-4o-transcribe", "nvidia-conformer-ctc-large-arabic (lm)", "nvidia-conformer-ctc-large-arabic (greedy)", "openai/whisper-large-v3", "facebook/seamless-m4t-v2-large", "openai/whisper-large-v3-turbo", "openai/whisper-large-v2", "openai/whisper-large", "asafaya/hubert-large-arabic-transcribe", "openai/whisper-medium", "nvidia-Parakeet-ctc-1.1b-concat", "nvidia-Parakeet-ctc-1.1b-universal", "facebook/mms-1b-all", "openai/whisper-small", "whitefox123/w2v-bert-2.0-arabic-4", "jonatasgrosman/wav2vec2-large-xlsr-53-arabic", "speechbrain/asr-wav2vec2-commonvoice-14-ar"],
    SORT_COLUMN: [28.32, 29.96, 29.96, 30.71, 35.87, 26.68, 40.05, 45.72, 44.97, 32.91, 34.74, 36.86, 38.16, 40.05, 40.20, 42.57, 45.50, 45.57, 46.54, 51.96, 54.54, 55.13, 58.13, 60.98, 65.74],
    "Average CER": [12.52, 13.77, 13.40, 13.67, 15.26, 10.05, 14.75, 19.45, 24.31, 13.84, 13.37, 17.21, 17.03, 18.87, 19.55, 20.49, 17.35, 22.27, 23.88, 25.19, 21.45, 21.68, 27.62, 25.61, 30.93],
    "SADA WER": [41.61, 46.18, 43.84, 44.82, 46.57, 27.71, 49.44, 58.5, 66.47, 44.52, 47.26, 55.96, 62.52, 60.36, 57.46, 63.24, 67.82, 67.71, 70.70, 73.58, 77.48, 78.02, 87.34, 86.82, 88.54],
    "SADA CER": [24.95, 27.27, 24.54, 26.11, 25.55, 11.65, 23.33, 35.39, 49.57, 23.76, 22.54, 34.62, 37.61, 37.67, 36.59, 40.16, 31.83, 43.83, 46.70, 49.48, 37.50, 33.17, 56.75, 44.20, 50.28],
    "Common Voice\nWER": [8.75, 9.15, 9.55, 11.46, 17.68, 10.42, 28.27, 33.775, 28.19, 8.80, 10.60, 17.83, 21.70, 25.73, 21.77, 26.04, 8.01, 28.07, 26.34, 40.01, 26.52, 24.18, 41.79, 23.00, 29.17],
    "Common Voice\nCER": [2.71, 2.80, 2.97, 4.28, 4.59, 3.21, 7.33, 9.29, 8.14, 2.77, 3.05, 5.74, 6.24, 10.89, 7.44, 9.61, 2.37, 10.38, 9.82, 14.64, 7.21, 6.79, 15.75, 6.64, 9.85],
    "MASC(clean-test)\nWER": [19.69, 19.90, 20.03, 21.47, 25.71, 21.74, 31.93, 40.66, 31.53, 23.74, 24.12, 24.66, 25.04, 25.51, 27.25, 28.89, 32.94, 29.99, 30.49, 36.16, 38.82, 35.93, 37.82, 42.75, 49.10],
    "MASC(clean-test)\nCER": [5.76, 6.13, 6.14, 5.59, 8.24, 5.8, 8.23, 14.735, 8.85, 5.63, 5.63, 7.24, 7.19, 7.55, 8.28, 9.05, 7.15, 8.98, 8.41, 10.29, 10.36, 9.01, 11.92, 11.87, 16.37],
    "MASC(noisy-test)\nWER": [29.29, 30.03, 30.26, 30.85, 33.91, 28.08, 41.23, 45.645, 43.29, 34.29, 35.64, 34.63, 33.24, 37.16, 38.55, 40.79, 50.16, 42.91, 45.95, 50.03, 57.33, 56.36, 53.28, 64.27, 69.57],
    "MASC(noisy-test)\nCER": [10.66, 11.27, 11.18, 11.28, 13.21, 8.88, 13.14, 15.77, 18.81, 11.07, 11.02, 12.89, 11.92, 13.93, 15.49, 16.31, 15.62, 17.49, 18.72, 20.09, 19.76, 19.43, 21.93, 24.17, 30.17],
    "MGB-2 WER": [14.13, 14.22, 15.34, 13.09, 27.76, 12.1, 25.68, 30.91, 29.62, 17.20, 19.69, 16.26, 20.23, 17.75, 25.17, 24.28, 37.51, 29.32, 24.94, 30.68, 39.16, 48.64, 40.66, 56.29, 64.37],
    "MGB-2 CER": [7.10, 7.06, 7.56, 6.20, 8.34, 5.27, 9.27, 13.7, 17.34, 6.87, 7.46, 7.74, 9.37, 8.34, 13.48, 12.10, 11.07, 14.82, 9.87, 11.36, 13.48, 15.56, 19.39, 20.44, 26.56],
    "Casablanca\nWER": [56.46, 60.27, 60.68, 62.55, 63.57, 60.04, 63.77, 64.84, 70.72, 68.90, 71.13, 71.81, 66.25, 73.79, 71.01, 72.18, 76.53, 75.44, 80.80, 81.30, 87.95, 87.64, 87.88, 92.72, 93.68],
    "Casablanca\nCER": [23.96, 28.06, 28.02, 28.53, 31.63, 25.51, 27.17, 27.84, 43.15, 32.97, 30.50, 35.04, 29.85, 34.83, 36.00, 35.71, 36.03, 38.12, 49.77, 45.31, 40.41, 46.12, 39.99, 46.33, 52.36],
}

# Rank models best-first.  Several models share the same average, so a stable
# algorithm (mergesort) is requested: ties then keep the order in which they
# are listed in `results` instead of depending on the sort implementation.
original_df = pd.DataFrame(results).sort_values(by=SORT_COLUMN, kind="mergesort")

# Per-column datatypes for gr.Dataframe: the first column ("Model") is text and
# every remaining column is numeric.  Derived from the frame so that adding or
# removing a metric column cannot leave this list out of sync.
TYPES = ["str"] + ["number"] * (len(original_df.columns) - 1)

# Keep leaderboard column headers on a single line.
LEADERBOARD_CSS = """
#leaderboard-table th .header-content {
    white-space: nowrap;
}
"""

def request_model(model_text):
    """Return the HTML notice displayed after the "Request" button is clicked.

    Args:
        model_text: Contents of the model-name textbox.  It is not used: the
            function does not record the request and always returns the same
            notice pointing the user to the GitHub repo.

    Returns:
        The notice wrapped by ``styled_message``.
    """
    return styled_message("🤗 Please launch a discussion in our GitHub repo, thank you. 🤗")

# Page layout: banner and introduction, three tabs (leaderboard table, metrics
# description, model-request form), the "last updated" note and a collapsible
# citation box.  Emoji in the labels are stored as real Unicode characters.
with gr.Blocks(css=LEADERBOARD_CSS) as demo:
    gr.HTML(BANNER, elem_id="banner")
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    # NOTE(review): all three tabs share elem_id "od-benchmark-tab-table";
    # HTML ids are normally unique — confirm whether this is intentional.
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Leaderboard", elem_id="od-benchmark-tab-table", id=0):
            # Read-only table; elem_id matches the selector in LEADERBOARD_CSS.
            leaderboard_table = gr.Dataframe(
                value=original_df,
                datatype=TYPES,
                elem_id="leaderboard-table",
                interactive=False,
                visible=True,
            )

        with gr.TabItem("📈 Metrics", elem_id="od-benchmark-tab-table", id=1):
            gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")

        with gr.TabItem("✉️✨ Request a model here!", elem_id="od-benchmark-tab-table", id=2):
            with gr.Column():
                gr.Markdown("# ✉️✨ Request results for a new model here!", elem_classes="markdown-text")
                model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
                mdw_submission_result = gr.Markdown()
                btn_submit = gr.Button(value="🚀 Request")
                # Clicking the button renders request_model's notice in the
                # Markdown component above.
                btn_submit.click(request_model, [model_name_textbox], mdw_submission_result)

    gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            gr.Textbox(
                value=CITATION_BUTTON_TEXT, lines=7,
                label="Copy the BibTeX snippet to cite this source",
                elem_id="citation-button",
                # show_copy_button=True,
            )

demo.launch(allowed_paths=["banner.png"], ssr_mode=False)