|
import gradio as gr |
|
import pandas as pd |
|
from css_html_js import custom_css |
|
|
|
# HTML heading shown at the top of the page (passed to gr.HTML).
TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
|
|
|
# Markdown shown under the title: what the leaderboard is, which benchmark
# datasets are used, who contributed scores, and the legend for the tag
# column ('T') of the score table.
INTRODUCTION_TEXT = """
📐 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.

## Dataset

📈 We evaluate models based on 4 datasets,

1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
- This test is for 15 years old Malaysia student, it is about reading comprehension and general knowledge for malay language.
2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
- This test is general test for malay grammar.
3. General high school science questions, contains 323 questions, https://huggingface.co/datasets/mesolitica/mysoalan.com-qa
- This test is general test for science.
4. Translated MMLU, https://huggingface.co/datasets/mesolitica/translated-MMLU
- This test is to test general knowledge, originally from MMLU.

## Contributions

1. Claude 1.3 and 2.0 Tatabahasa contributed by https://www.linkedin.com/in/fahim-surani
2. Claude 3.0 contributed by https://github.com/theblackcat102, https://huggingface.co/theblackcat102

## Tagging

🟢 pretrained ⭕ instruction-tuned 🟦 close sourced
"""
|
|
|
# Scores for closed-source (API-only) models, one dict per leaderboard row.
# 'T' is the tag shown in the first column (🟦 = close sourced); the other
# keys are accuracy percentages per benchmark and shot count. A benchmark
# that was not run for a model is simply left out of its dict.
close_source = [
    {
        'T': '🟦',
        'model': 'claude-3-opus-20240229',
        'BM-PT3 0-shot': 57.41,
        'BM-PT3 1-shot': 53.70,
        'BM-PT3 3-shots': 62.96,
        'Tatabahasa 0-shot': 77.08,
        'Tatabahasa 1-shot': 73.93,
        'Tatabahasa 3-shots': 75.64,
    },
    {
        'T': '🟦',
        'model': 'claude-3-sonnet-20240229',
        'BM-PT3 0-shot': 48.15,
        'BM-PT3 1-shot': 50.00,
        'BM-PT3 3-shots': 37.04,
        'Tatabahasa 0-shot': 65.90,
        'Tatabahasa 1-shot': 38.40,
        'Tatabahasa 3-shots': 40.97,
    },
    {
        'T': '🟦',
        'model': 'claude-3-haiku-20240307',
        'BM-PT3 0-shot': 48.15,
        'BM-PT3 1-shot': 50.00,
        'BM-PT3 3-shots': 50.00,
        'Tatabahasa 0-shot': 62.75,
        'Tatabahasa 1-shot': 49.86,
        'Tatabahasa 3-shots': 24.07,
    },
    {
        'T': '🟦',
        'model': 'AWS Bedrock Claude 1.3',
        'Tatabahasa 0-shot': 60.650887573964496,
        'Tatabahasa 1-shot': 62.46418338108882,
        'Tatabahasa 3-shots': 67.34104046242774,
    },
    {
        'T': '🟦',
        'model': 'AWS Bedrock Claude 2',
        'Tatabahasa 0-shot': 61.702127659574465,
        'Tatabahasa 1-shot': 60.17191977077364,
        'Tatabahasa 3-shots': 59.598853868194844,
    },
    {
        'T': '🟦',
        'model': 'gpt-4-1106-preview',
        'BM-PT3 0-shot': 51.85185185185185,
        'BM-PT3 1-shot': 66.66666666666666,
        'BM-PT3 3-shots': 55.55555555555556,
        'Tatabahasa 0-shot': 75.64469914040114,
        'Tatabahasa 1-shot': 73.63896848137536,
        'Tatabahasa 3-shots': 75.64469914040114,
    },
    {
        'T': '🟦',
        'model': 'gpt-3.5-turbo-0613',
        'BM-PT3 0-shot': 36.53846153846153,
        'BM-PT3 1-shot': 28.846153846153843,
        'BM-PT3 3-shots': 24.528301886792452,
        'Tatabahasa 0-shot': 59.530791788856305,
        'Tatabahasa 1-shot': 60.80691642651297,
        'Tatabahasa 3-shots': 63.03724928366762,
    },
]
|
|
|
# Scores for open-weight models, one dict per leaderboard row.
# 'T' is the tag shown in the first column (🟢 = pretrained,
# ⭕ = instruction-tuned); 'model' is a markdown link to the model page; the
# other keys are accuracy percentages per benchmark and shot count. A
# benchmark that was not run for a model is simply left out of its dict.
open_source = [
    {
        'T': '🟢',
        'model': '[meta-llama/llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
        'Tatabahasa 0-shot': 24.355300859598856,
        'Tatabahasa 1-shot': 28.08022922636103,
        'Tatabahasa 3-shots': 24.641833810888254,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/tinyllama-1.1b-4096-fpf](https://huggingface.co/mesolitica/tinyllama-1.1b-4096-fpf)',
        'Tatabahasa 0-shot': 23.248407643312103,
        'Tatabahasa 1-shot': 27.22063037249284,
        'Tatabahasa 3-shots': 24.355300859598856,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/malaysian-llama2-7b-32k](https://huggingface.co/mesolitica/llama-7b-hf-32768-fpf)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 20.37037037037037,
        'BM-PT3 3-shots': 29.629629629629626,
        'Tatabahasa 0-shot': 17.765042979942695,
        'Tatabahasa 1-shot': 24.068767908309454,
        'Tatabahasa 3-shots': 27.507163323782237,
    },
    {
        'T': '⭕',
        'model': '[mesolitica/malaysian-llama2-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions-v2)',
        'BM-PT3 0-shot': 33.33333333333333,
        'BM-PT3 1-shot': 37.03703703703704,
        'BM-PT3 3-shots': 35.18518518518518,
        'Tatabahasa 0-shot': 59.31232091690545,
        'Tatabahasa 1-shot': 53.86819484240688,
        'Tatabahasa 3-shots': 45.55873925501432,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/malaysian-llama2-13b-32k](https://huggingface.co/mesolitica/llama-13b-hf-32768-fpf)',
        'BM-PT3 0-shot': 33.33333333333333,
        'BM-PT3 1-shot': 20.37037037037037,
        'BM-PT3 3-shots': 31.48148148148148,
        'Tatabahasa 0-shot': 26.07449856733524,
        'Tatabahasa 1-shot': 25.214899713467048,
        'Tatabahasa 3-shots': 24.355300859598856,
    },
    {
        'T': '⭕',
        # Label org matches the linked repository (mesolitica, not mistralai).
        'model': '[mesolitica/malaysian-llama2-13b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions)',
        'BM-PT3 0-shot': 28.57142857142857,
        'BM-PT3 1-shot': 12.244897959183673,
        'BM-PT3 3-shots': 17.307692307692307,
    },
    {
        'T': '🟢',
        'model': '[mistralai/mistral-7b](https://huggingface.co/mistralai/Mistral-7B-v0.1)',
        'Tatabahasa 0-shot': 28.939828080229223,
        'Tatabahasa 1-shot': 34.38395415472779,
        'Tatabahasa 3-shots': 32.95128939828081,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/malaysian-mistral-7b-4k](https://huggingface.co/mesolitica/mistral-7b-4096-fpf)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 22.22222222222222,
        'BM-PT3 3-shots': 33.33333333333333,
        'Tatabahasa 0-shot': 21.48997134670487,
        'Tatabahasa 1-shot': 28.939828080229223,
        'Tatabahasa 3-shots': 24.641833810888254,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/malaysian-mistral-7b-32k](https://huggingface.co/mesolitica/mistral-7b-32768-fpf)',
        'BM-PT3 0-shot': 16.666666666666664,
        'BM-PT3 1-shot': 16.666666666666664,
        'BM-PT3 3-shots': 25.925925925925924,
        'Tatabahasa 0-shot': 18.624641833810887,
        'Tatabahasa 1-shot': 24.355300859598856,
        'Tatabahasa 3-shots': 28.653295128939828,
    },
    {
        'T': '⭕',
        'model': '[mesolitica/malaysian-mistral-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
        'BM-PT3 0-shot': 40.74074074074074,
        'BM-PT3 1-shot': 33.33333333333333,
        'BM-PT3 3-shots': 37.03703703703704,
        'Tatabahasa 0-shot': 65.32951289398281,
        'Tatabahasa 1-shot': 57.306590257879655,
        'Tatabahasa 3-shots': 56.446991404011456,
    },
    {
        'T': '⭕',
        # NOTE(review): the label says "-v4" but the link is the same URL as
        # the non-v4 row above — confirm the intended repository.
        'model': '[mesolitica/malaysian-mistral-7b-32k-instructions-v4](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
        'BM-PT3 0-shot': 35.18518518518518,
        'BM-PT3 1-shot': 31.48148148148148,
        'BM-PT3 3-shots': 33.33333333333333,
        'Tatabahasa 0-shot': 66.4756446991404,
        'Tatabahasa 1-shot': 54.15472779369628,
        'Tatabahasa 3-shots': 49.8567335243553,
    },
    {
        'T': '🟢',
        'model': '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 25.925925925925924,
        'BM-PT3 3-shots': 31.48148148148148,
        'Tatabahasa 0-shot': 21.776504297994272,
        'Tatabahasa 1-shot': 21.776504297994272,
        'Tatabahasa 3-shots': 24.641833810888254,
    },
    {
        'T': '🟢',
        'model': '[aisingapore/sealion7b](https://huggingface.co/aisingapore/sealion7b)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 24.074074074074073,
        'BM-PT3 3-shots': 33.33333333333333,
        'Tatabahasa 0-shot': 25.787965616045845,
        'Tatabahasa 1-shot': 27.507163323782237,
        'Tatabahasa 3-shots': 26.07449856733524,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/mallam-1.1B-4096](https://huggingface.co/mesolitica/mallam-1.1B-4096)',
        'Tatabahasa 0-shot': 25.757575757575758,
        'Tatabahasa 1-shot': 25.787965616045845,
        'Tatabahasa 3-shots': 28.08022922636103,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/mallam-3B-4096](https://huggingface.co/mesolitica/mallam-3B-4096)',
        'Tatabahasa 0-shot': 24.567474048442904,
        'Tatabahasa 1-shot': 24.641833810888254,
        'Tatabahasa 3-shots': 28.653295128939828,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/mallam-5B-4096](https://huggingface.co/mesolitica/mallam-5B-4096)',
        'Tatabahasa 0-shot': 24.074074074074073,
        'Tatabahasa 1-shot': 27.793696275071632,
        'Tatabahasa 3-shots': 28.653295128939828,
    },
    {
        'T': '🟢',
        'model': '[sail/Sailor-0.5B](https://huggingface.co/sail/Sailor-0.5B)',
        'Tatabahasa 0-shot': 17.191977077363894,
        'Tatabahasa 1-shot': 23.78223495702006,
        'Tatabahasa 3-shots': 25.501432664756447,
    },
    {
        'T': '🟢',
        'model': '[sail/Sailor-1.8B](https://huggingface.co/sail/Sailor-1.8B)',
        'Tatabahasa 0-shot': 29.512893982808023,
        'Tatabahasa 1-shot': 27.507163323782237,
        'Tatabahasa 3-shots': 24.92836676217765,
    },
    {
        'T': '🟢',
        'model': '[sail/Sailor-4B](https://huggingface.co/sail/Sailor-4B)',
        'Tatabahasa 0-shot': 31.51862464183381,
        'Tatabahasa 1-shot': 36.10315186246418,
        'Tatabahasa 3-shots': 27.507163323782237,
    },
    {
        'T': '🟢',
        'model': '[sail/Sailor-7B](https://huggingface.co/sail/Sailor-7B)',
        'Tatabahasa 0-shot': 55.30085959885387,
        'Tatabahasa 1-shot': 54.72779369627507,
        'Tatabahasa 3-shots': 59.02578796561605,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/gemma-2B-8192-fpf](https://huggingface.co/mesolitica/gemma-2B-8192-fpf)',
        'Tatabahasa 0-shot': 14.613180515759314,
        'Tatabahasa 1-shot': 25.501432664756447,
        'Tatabahasa 3-shots': 23.49570200573066,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/Qwen1.5-0.5B-4096-fpf](https://huggingface.co/mesolitica/Qwen1.5-0.5B-4096-fpf)',
        'Tatabahasa 0-shot': 13.753581661891118,
        'Tatabahasa 1-shot': 21.20343839541547,
        'Tatabahasa 3-shots': 22.636103151862464,
    },
    {
        'T': '⭕',
        'model': '[mesolitica/mallam-1.1b-20k-instructions](https://huggingface.co/mesolitica/mallam-1.1b-20k-instructions)',
        'Tatabahasa 0-shot': 26.923076923076923,
        'Tatabahasa 1-shot': 28.939828080229223,
        'Tatabahasa 3-shots': 21.776504297994272,
    },
]
|
|
|
# One table for the whole leaderboard: closed-source rows first, then open
# models. Benchmarks missing from a row's dict become NaN cells.
data = pd.DataFrame(close_source + open_source)

# Page layout, top to bottom: title, introduction, score table.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # Cells are typed as markdown so the "[name](url)" model labels can
    # render as links.
    gr.DataFrame(data, datatype='markdown')

# Start the server only when run as a script, so importing this module
# (tests, tooling) does not block on a running web server.
if __name__ == "__main__":
    demo.launch()