import gradio as gr
import pandas as pd

from css_html_js import custom_css
|
# Page heading, rendered as raw HTML. The leading emoji is the Malaysian flag
# (regional indicators M + Y), restored from its mis-decoded form.
TITLE = """<h1 align="center" id="space-title">🇲🇾 Malaysian Speech-to-Text Leaderboard</h1>"""
|
# Markdown body shown under the title: project summary, the evaluation
# datasets, and the normalisation applied to the test sets.
# The flag emoji was restored from its mis-decoded form, and the stray " |"
# line residue was removed from inside the literal.
# NOTE(review): the lone "π" that opens two paragraphs looks like the remnant
# of another mis-decoded emoji; its original code point cannot be recovered
# from this file, so it is left untouched -- confirm against the upstream copy.
INTRODUCTION_TEXT = """
π The 🇲🇾 Malaysian Speech-to-Text Leaderboard aims to track, rank and evaluate Malaysian Speech-to-Text models. All notebooks at https://github.com/mesolitica/malaysian-stt-benchmarks

## Dataset

π We evaluate models based on 3 datasets,

1. Malaya-Speech test set, Malay language, https://huggingface.co/datasets/mesolitica/speech-test-set/tree/main/malaya-speech
2. Fleurs MS-MY test set, Malay language, https://huggingface.co/datasets/mesolitica/speech-test-set/tree/main/fleurs-ms-my
3. IMDA TTS first 700 audio files, English language but with Manglish slang, https://huggingface.co/datasets/mesolitica/IMDA-TTS

## Heavy postprocess test set

1. We filtered test set that contain numbers because malaya-speech transducer trained on normalized numbers.
2. We lower case because malaya-speech transducer trained on lower case.
3. We removed punctuation because malaya-speech transducer trained without punctuation.
"""
|
# Leaderboard rows, one dict per evaluated model. Keys double as the column
# headers of the table built from this list. Rows that have no IMDA TTS score
# recorded here simply omit those two keys, and 'goodtape.io' has no model
# size (None).
# NOTE(review): the column labels read 'Fleurs MY-MS' while the introduction
# text names the dataset 'Fleurs MS-MY' -- confirm which spelling is intended
# before renaming anything, since these keys are user-visible.
open_source = [
    {
        'model': 'goodtape.io',
        'model size FP16 (MB)': None,
        'Malaya-Speech test CER': 0.09504487340205486,
        'Malaya-Speech test WER': 0.1691902868373457,
        'Fleurs MY-MS CER': 0.03643102801583697,
        'Fleurs MY-MS WER': 0.08672758155453257,
    },
    {
        'model': 'openai/whisper-large-v3',
        'model size FP16 (MB)': 3090,
        'Malaya-Speech test CER': 0.0349251317825172,
        'Malaya-Speech test WER': 0.1032828282828283,
        'Fleurs MY-MS CER': 0.026055551396846878,
        'Fleurs MY-MS WER': 0.07652049926522007,
        'IMDA TTS CER': 0.016648493852990828,
        'IMDA TTS WER': 0.0386282289139432,
    },
    {
        'model': 'openai/whisper-medium',
        'model size FP16 (MB)': 1530,
        'Malaya-Speech test CER': 0.05064920144820262,
        'Malaya-Speech test WER': 0.17534205321090568,
        'Fleurs MY-MS CER': 0.04366882208520179,
        'Fleurs MY-MS WER': 0.13546055192128273,
        'IMDA TTS CER': 0.02065587879424904,
        'IMDA TTS WER': 0.047277690563404855,
    },
    {
        'model': 'openai/whisper-small',
        'model size FP16 (MB)': 483.5,
        'Malaya-Speech test CER': 0.07485209857268262,
        'Malaya-Speech test WER': 0.25748516055893106,
        'Fleurs MY-MS CER': 0.06781078047622793,
        'Fleurs MY-MS WER': 0.21953142859857497,
        'IMDA TTS CER': 0.024812471688517194,
        'IMDA TTS WER': 0.058901277294134434,
    },
    {
        'model': 'openai/whisper-base',
        'model size FP16 (MB)': 145,
        'Malaya-Speech test CER': 0.3574879236610538,
        'Malaya-Speech test WER': 0.8303456599563157,
        'Fleurs MY-MS CER': 0.1319124653794061,
        'Fleurs MY-MS WER': 0.40499286081235003,
        'IMDA TTS CER': 0.03914533450681607,
        'IMDA TTS WER': 0.08951682444539587,
    },
    {
        'model': 'openai/whisper-tiny',
        'model size FP16 (MB)': 75.5,
        'Malaya-Speech test CER': 0.26941094281472105,
        'Malaya-Speech test WER': 0.7414099751189915,
        'Fleurs MY-MS CER': 0.38749733168917505,
        'Fleurs MY-MS WER': 0.812253445128297,
        'IMDA TTS CER': 0.048805770734828904,
        'IMDA TTS WER': 0.11150629529200957,
    },
    {
        'model': 'mesolitica/malaysian-whisper-medium',
        'model size FP16 (MB)': 1530,
        'Malaya-Speech test CER': 0.05622483776367814,
        'Malaya-Speech test WER': 0.14406629724252673,
        'Fleurs MY-MS CER': 0.025543266604368554,
        'Fleurs MY-MS WER': 0.07940219915492629,
        'IMDA TTS CER': 0.01971214262944062,
        'IMDA TTS WER': 0.047223078508792794,
    },
    {
        'model': 'mesolitica/malaysian-whisper-small',
        'model size FP16 (MB)': 483.5,
        'Malaya-Speech test CER': 0.049162419174983304,
        'Malaya-Speech test WER': 0.15926901346983313,
        'Fleurs MY-MS CER': 0.035517572531147,
        'Fleurs MY-MS WER': 0.10938718963023729,
        'IMDA TTS CER': 0.024228721439634855,
        'IMDA TTS WER': 0.05546294182008469,
    },
    {
        'model': 'mesolitica/malaysian-whisper-base',
        'model size FP16 (MB)': 145,
        'Malaya-Speech test CER': 0.07242006488452603,
        'Malaya-Speech test WER': 0.22081683495617924,
        'Fleurs MY-MS CER': 0.06639564802362424,
        'Fleurs MY-MS WER': 0.19675812232021192,
        'IMDA TTS CER': 0.03982418421412676,
        'IMDA TTS WER': 0.08917690642690643,
    },
    {
        'model': 'mesolitica/malaysian-whisper-tiny',
        'model size FP16 (MB)': 75.5,
        'Malaya-Speech test CER': 0.09423990117534763,
        'Malaya-Speech test WER': 0.295029492365558,
        'Fleurs MY-MS CER': 0.13390519685940314,
        'Fleurs MY-MS WER': 0.3461808122686204,
        'IMDA TTS CER': 0.07957313474501154,
        'IMDA TTS WER': 0.1421708648494363,
    },
    {
        'model': 'mesolitica/conformer-large-malay-whisper',
        'model size FP16 (MB)': 206.5,
        'Malaya-Speech test CER': 0.025933167255719317,
        'Malaya-Speech test WER': 0.0912131356803488,
        'Fleurs MY-MS CER': 0.02548791948171514,
        'Fleurs MY-MS WER': 0.08376713097429746,
    },
    {
        'model': 'mesolitica/conformer-medium-malay-whisper',
        'model size FP16 (MB)': 121.5,
        'Malaya-Speech test CER': 0.024955598713609053,
        'Malaya-Speech test WER': 0.09315638444736804,
        'Fleurs MY-MS CER': 0.029205645523910067,
        'Fleurs MY-MS WER': 0.09253131557833799,
    },
    {
        'model': 'mesolitica/conformer-medium-mixed',
        'model size FP16 (MB)': 121.5,
        'Malaya-Speech test CER': 0.034618711056551774,
        'Malaya-Speech test WER': 0.11179440626161938,
        'Fleurs MY-MS CER': 0.032894184549728075,
        'Fleurs MY-MS WER': 0.1026977414887425,
    },
    {
        'model': 'mesolitica/conformer-tiny-ctc + mesolitica/kenlm-pseudolabel-whisper-large-v3',
        'model size FP16 (MB)': 7.9,
        'Malaya-Speech test CER': 0.0612581761581601,
        'Malaya-Speech test WER': 0.21302693966628394,
        'Fleurs MY-MS CER': 0.07573301800412188,
        'Fleurs MY-MS WER': 0.2527434609577528,
    },
    {
        'model': 'mesolitica/conformer-12M-ctc + mesolitica/kenlm-pseudolabel-whisper-large-v3',
        'model size FP16 (MB)': 24.2,
        'Malaya-Speech test CER': 0.06941749946814912,
        'Malaya-Speech test WER': 0.22261096523391607,
        'Fleurs MY-MS CER': 0.07657934690019219,
        'Fleurs MY-MS WER': 0.263075623142674,
    },
]
|
# Tabular form of the leaderboard rows: one row per entry of `open_source`,
# one column per distinct key.
data = pd.DataFrame(open_source)
|
# Assemble the page top to bottom: HTML title, Markdown introduction, then the
# results table. Components created inside the `with` block are attached to
# `demo`.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    gr.DataFrame(data)

# Start the app as soon as the module is executed.
demo.launch()