|
import gradio as gr
import pandas as pd

from css_html_js import custom_css
|
|
|
# Root Blocks container for the leaderboard page, styled with the CSS string
# imported from css_html_js. Components are attached to it further down.
demo = gr.Blocks(css=custom_css)
|
|
|
# HTML heading rendered at the top of the page (passed to gr.HTML below).
TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
|
|
|
# Markdown shown under the title: what the leaderboard tracks, where the
# evaluation notebooks live, how to submit a score, and the benchmark datasets.
# The explicit "\n" after the first sentence plus the physical line break
# yields a blank line, i.e. a Markdown paragraph break.
# NOTE(review): the emoji that open the first paragraph and the "We evaluate"
# line were not recoverable from the mis-decoded text; the two used here are
# best guesses - confirm against the original file.
INTRODUCTION_TEXT = """
📐 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks.\n
🤗 All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.

## Dataset

📈 We evaluate models based on 3 datasets,

1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
3. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
"""
|
|
|
# Leaderboard table: one record per model, one column per benchmark/shot
# setting. A model that has no entry for a benchmark simply omits the key, so
# pandas fills that cell with NaN when the records are combined.
data = pd.DataFrame(
    [
        {
            'model': 'gpt-3.5-turbo-0613',
            'BM-PT3 0-shot': 36.53846153846153,
            'BM-PT3 1-shot': 28.846153846153843,
            'BM-PT3 3-shots': 24.528301886792452,
            'Tatabahasa 0-shot': 59.530791788856305,
            'Tatabahasa 1-shot': 60.80691642651297,
            'Tatabahasa 3-shots': 63.03724928366762,
        },
        {
            'model': 'gpt-4-1106-preview',
            'Tatabahasa 0-shot': 75.64469914040114,
            'Tatabahasa 1-shot': 73.63896848137536,
            'Tatabahasa 3-shots': 75.64469914040114,
        },
        {
            'model': 'malaysian-llama2-7b-32k',
            'BM-PT3 0-shot': 20.37037037037037,
            'BM-PT3 1-shot': 20.37037037037037,
            'BM-PT3 3-shots': 29.629629629629626,
            'Tatabahasa 0-shot': 17.765042979942695,
            'Tatabahasa 1-shot': 24.068767908309454,
            'Tatabahasa 3-shots': 27.507163323782237,
        },
        {
            'model': 'malaysian-llama2-7b-32k-instructions',
            'BM-PT3 0-shot': 35.294117647058826,
            'BM-PT3 1-shot': 21.153846153846153,
            'BM-PT3 3-shots': 28.30188679245283,
        },
        {
            'model': 'malaysian-llama2-13b-32k',
            'BM-PT3 0-shot': 33.33333333333333,
            'BM-PT3 1-shot': 20.37037037037037,
            'BM-PT3 3-shots': 31.48148148148148,
        },
        {
            'model': 'malaysian-llama2-13b-32k-instructions',
            'BM-PT3 0-shot': 28.57142857142857,
            'BM-PT3 1-shot': 12.244897959183673,
            'BM-PT3 3-shots': 17.307692307692307,
        },
        {
            'model': 'malaysian-mistral-7b-4k',
            'BM-PT3 0-shot': 20.37037037037037,
            'BM-PT3 1-shot': 22.22222222222222,
            'BM-PT3 3-shots': 33.33333333333333,
            'Tatabahasa 0-shot': 21.48997134670487,
            'Tatabahasa 1-shot': 28.939828080229223,
            'Tatabahasa 3-shots': 24.641833810888254,
        },
        {
            'model': 'malaysian-mistral-7b-32k',
            'BM-PT3 0-shot': 16.666666666666664,
            'BM-PT3 1-shot': 16.666666666666664,
            'BM-PT3 3-shots': 25.925925925925924,
        },
        {
            'model': 'malaysian-mistral-7b-32k-instructions',
            'BM-PT3 0-shot': 21.568627450980394,
            'BM-PT3 1-shot': 31.25,
            'BM-PT3 3-shots': 28.000000000000004,
        },
    ]
)
|
|
|
# Build the page: components instantiated inside the `with demo:` context are
# added to the Blocks app in creation order - title, introduction, then the
# results table.
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    gr.DataFrame(data)

# Start the web server for the assembled app.
demo.launch()