# Page header captured along with this file (not part of the program):
# huseinzol05's picture
# Merge branch 'main' of https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard into main
# 0f4fb69
import gradio as gr
import pandas as pd
from css_html_js import custom_css
# HTML heading rendered at the top of the page via gr.HTML.
TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
# Markdown shown under the title: purpose of the leaderboard, the benchmark
# datasets, contributor credits, and the legend for the 'T' (tag) column.
# NOTE(review): the emoji were mis-decoded in the previous revision. The very
# first one had lost a byte and was restored as 📐 on the assumption that it
# follows the Open LLM Leaderboard intro wording - confirm.
INTRODUCTION_TEXT = """
📐 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
## Dataset
📈 We evaluate models based on 3 datasets,
1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
- This test is for 15 years old Malaysia student, it is about reading comprehension and general knowledge for malay language.
2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
- This test is general test for malay grammar.
3. Translated MMLU, https://huggingface.co/datasets/mesolitica/translated-MMLU
- This test is to test general knowledge, originally from MMLU.
## Contributions
1. Claude 1.3 and 2.0 Tatabahasa contributed by https://www.linkedin.com/in/fahim-surani
2. Claude 3.0 contributed by https://github.com/theblackcat102, https://huggingface.co/theblackcat102
## Tagging
🟢 pretrained ⭕ instruction-tuned 📦 close sourced
"""
# Scores for closed-source (API-only) models, one dict per leaderboard row.
# 'T' is the tag shown in the first column (📦 = close sourced, matching the
# legend in INTRODUCTION_TEXT). A benchmark key that is absent from a row
# simply has no reported score for that model.
close_source = [
    {
        'T': '📦',
        'model': 'claude-3-opus-20240229',
        'BM-PT3 0-shot': 57.41,
        'BM-PT3 1-shot': 53.70,
        'BM-PT3 3-shots': 62.96,
        'Tatabahasa 0-shot': 77.08,
        'Tatabahasa 1-shot': 73.93,
        'Tatabahasa 3-shots': 75.64,
    },
    {
        'T': '📦',
        'model': 'claude-3-sonnet-20240229',
        'BM-PT3 0-shot': 48.15,
        'BM-PT3 1-shot': 50.00,
        'BM-PT3 3-shots': 37.04,
        'Tatabahasa 0-shot': 65.90,
        'Tatabahasa 1-shot': 38.40,
        'Tatabahasa 3-shots': 40.97,
    },
    {
        'T': '📦',
        'model': 'claude-3-haiku-20240307',
        'BM-PT3 0-shot': 48.15,
        'BM-PT3 1-shot': 50.00,
        'BM-PT3 3-shots': 50.00,
        'Tatabahasa 0-shot': 62.75,
        'Tatabahasa 1-shot': 49.86,
        'Tatabahasa 3-shots': 24.07,
    },
    {
        'T': '📦',
        'model': 'AWS Bedrock Claude 1.3',
        'Tatabahasa 0-shot': 60.650887573964496,
        'Tatabahasa 1-shot': 62.46418338108882,
        'Tatabahasa 3-shots': 67.34104046242774,
    },
    {
        'T': '📦',
        'model': 'AWS Bedrock Claude 2',
        'Tatabahasa 0-shot': 61.702127659574465,
        'Tatabahasa 1-shot': 60.17191977077364,
        'Tatabahasa 3-shots': 59.598853868194844,
    },
    {
        'T': '📦',
        'model': 'gpt-4-1106-preview',
        'BM-PT3 0-shot': 51.85185185185185,
        'BM-PT3 1-shot': 66.66666666666666,
        'BM-PT3 3-shots': 55.55555555555556,
        'Tatabahasa 0-shot': 75.64469914040114,
        'Tatabahasa 1-shot': 73.63896848137536,
        'Tatabahasa 3-shots': 75.64469914040114,
    },
    {
        'T': '📦',
        'model': 'gpt-3.5-turbo-0613',
        'BM-PT3 0-shot': 36.53846153846153,
        'BM-PT3 1-shot': 28.846153846153843,
        'BM-PT3 3-shots': 24.528301886792452,
        'Tatabahasa 0-shot': 59.530791788856305,
        'Tatabahasa 1-shot': 60.80691642651297,
        'Tatabahasa 3-shots': 63.03724928366762,
    },
]
# Scores for open-weight models, one dict per leaderboard row.
# 'T' is the tag shown in the first column (🟢 = pretrained,
# ⭕ = instruction-tuned, matching the legend in INTRODUCTION_TEXT).
# 'model' is a markdown link so the table cell is clickable; each model must
# appear exactly once. A benchmark key that is absent from a row simply has
# no reported score for that model.
open_source = [
    {
        'T': '🟢',
        'model': '[meta-llama/llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
        'Tatabahasa 0-shot': 24.355300859598856,
        'Tatabahasa 1-shot': 28.08022922636103,
        'Tatabahasa 3-shots': 24.641833810888254,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/tinyllama-1.1b-4096-fpf](https://huggingface.co/mesolitica/tinyllama-1.1b-4096-fpf)',
        'Tatabahasa 0-shot': 23.248407643312103,
        'Tatabahasa 1-shot': 27.22063037249284,
        'Tatabahasa 3-shots': 24.355300859598856,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/malaysian-llama2-7b-32k](https://huggingface.co/mesolitica/llama-7b-hf-32768-fpf)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 20.37037037037037,
        'BM-PT3 3-shots': 29.629629629629626,
        'Tatabahasa 0-shot': 17.765042979942695,
        'Tatabahasa 1-shot': 24.068767908309454,
        'Tatabahasa 3-shots': 27.507163323782237,
    },
    {
        'T': '⭕',
        'model': '[mesolitica/malaysian-llama2-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions-v2)',
        'BM-PT3 0-shot': 33.33333333333333,
        'BM-PT3 1-shot': 37.03703703703704,
        'BM-PT3 3-shots': 35.18518518518518,
        'Tatabahasa 0-shot': 59.31232091690545,
        'Tatabahasa 1-shot': 53.86819484240688,
        'Tatabahasa 3-shots': 45.55873925501432,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/malaysian-llama2-13b-32k](https://huggingface.co/mesolitica/llama-13b-hf-32768-fpf)',
        'BM-PT3 0-shot': 33.33333333333333,
        'BM-PT3 1-shot': 20.37037037037037,
        'BM-PT3 3-shots': 31.48148148148148,
        'Tatabahasa 0-shot': 26.07449856733524,
        'Tatabahasa 1-shot': 25.214899713467048,
        'Tatabahasa 3-shots': 24.355300859598856,
    },
    {
        'T': '⭕',
        # Label previously read "mistralai/..." although the model (and its
        # link) belongs to the mesolitica organisation.
        'model': '[mesolitica/malaysian-llama2-13b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions)',
        'BM-PT3 0-shot': 28.57142857142857,
        'BM-PT3 1-shot': 12.244897959183673,
        'BM-PT3 3-shots': 17.307692307692307,
    },
    {
        'T': '🟢',
        'model': '[mistralai/mistral-7b](https://huggingface.co/mistralai/Mistral-7B-v0.1)',
        'Tatabahasa 0-shot': 28.939828080229223,
        'Tatabahasa 1-shot': 34.38395415472779,
        'Tatabahasa 3-shots': 32.95128939828081,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/malaysian-mistral-7b-4k](https://huggingface.co/mesolitica/mistral-7b-4096-fpf)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 22.22222222222222,
        'BM-PT3 3-shots': 33.33333333333333,
        'Tatabahasa 0-shot': 21.48997134670487,
        'Tatabahasa 1-shot': 28.939828080229223,
        'Tatabahasa 3-shots': 24.641833810888254,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/malaysian-mistral-7b-32k](https://huggingface.co/mesolitica/mistral-7b-32768-fpf)',
        'BM-PT3 0-shot': 16.666666666666664,
        'BM-PT3 1-shot': 16.666666666666664,
        'BM-PT3 3-shots': 25.925925925925924,
        'Tatabahasa 0-shot': 18.624641833810887,
        'Tatabahasa 1-shot': 24.355300859598856,
        'Tatabahasa 3-shots': 28.653295128939828,
    },
    {
        'T': '⭕',
        'model': '[mesolitica/malaysian-mistral-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
        'BM-PT3 0-shot': 40.74074074074074,
        'BM-PT3 1-shot': 33.33333333333333,
        'BM-PT3 3-shots': 37.03703703703704,
        'Tatabahasa 0-shot': 65.32951289398281,
        'Tatabahasa 1-shot': 57.306590257879655,
        'Tatabahasa 3-shots': 56.446991404011456,
    },
    {
        'T': '⭕',
        # Link previously pointed at the non-v4 repository, i.e. the same
        # target as the row above.
        'model': '[mesolitica/malaysian-mistral-7b-32k-instructions-v4](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions-v4)',
        'BM-PT3 0-shot': 35.18518518518518,
        'BM-PT3 1-shot': 31.48148148148148,
        'BM-PT3 3-shots': 33.33333333333333,
        'Tatabahasa 0-shot': 66.4756446991404,
        'Tatabahasa 1-shot': 54.15472779369628,
        'Tatabahasa 3-shots': 49.8567335243553,
    },
    {
        'T': '🟢',
        'model': '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 25.925925925925924,
        'BM-PT3 3-shots': 31.48148148148148,
        'Tatabahasa 0-shot': 21.776504297994272,
        'Tatabahasa 1-shot': 21.776504297994272,
        'Tatabahasa 3-shots': 24.641833810888254,
    },
    {
        'T': '🟢',
        'model': '[aisingapore/sealion7b](https://huggingface.co/aisingapore/sealion7b)',
        'BM-PT3 0-shot': 20.37037037037037,
        'BM-PT3 1-shot': 24.074074074074073,
        'BM-PT3 3-shots': 33.33333333333333,
        'Tatabahasa 0-shot': 25.787965616045845,
        'Tatabahasa 1-shot': 27.507163323782237,
        'Tatabahasa 3-shots': 26.07449856733524,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/mallam-1.1B-4096](https://huggingface.co/mesolitica/mallam-1.1B-4096)',
        'Tatabahasa 0-shot': 25.757575757575758,
        'Tatabahasa 1-shot': 25.787965616045845,
        'Tatabahasa 3-shots': 28.08022922636103,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/mallam-3B-4096](https://huggingface.co/mesolitica/mallam-3B-4096)',
        'Tatabahasa 0-shot': 24.567474048442904,
        'Tatabahasa 1-shot': 24.641833810888254,
        'Tatabahasa 3-shots': 28.653295128939828,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/mallam-5B-4096](https://huggingface.co/mesolitica/mallam-5B-4096)',
        'Tatabahasa 0-shot': 24.074074074074073,
        'Tatabahasa 1-shot': 27.793696275071632,
        'Tatabahasa 3-shots': 28.653295128939828,
    },
    {
        'T': '🟢',
        'model': '[sail/Sailor-0.5B](https://huggingface.co/sail/Sailor-0.5B)',
        'Tatabahasa 0-shot': 17.191977077363894,
        'Tatabahasa 1-shot': 23.78223495702006,
        'Tatabahasa 3-shots': 25.501432664756447,
    },
    {
        'T': '🟢',
        'model': '[sail/Sailor-1.8B](https://huggingface.co/sail/Sailor-1.8B)',
        'Tatabahasa 0-shot': 29.512893982808023,
        'Tatabahasa 1-shot': 27.507163323782237,
        'Tatabahasa 3-shots': 24.92836676217765,
    },
    {
        'T': '🟢',
        'model': '[sail/Sailor-4B](https://huggingface.co/sail/Sailor-4B)',
        'Tatabahasa 0-shot': 31.51862464183381,
        'Tatabahasa 1-shot': 36.10315186246418,
        'Tatabahasa 3-shots': 27.507163323782237,
    },
    {
        'T': '🟢',
        'model': '[sail/Sailor-7B](https://huggingface.co/sail/Sailor-7B)',
        'Tatabahasa 0-shot': 55.30085959885387,
        'Tatabahasa 1-shot': 54.72779369627507,
        'Tatabahasa 3-shots': 59.02578796561605,
    },
    # A second, identical mesolitica/mallam-5B-4096 row used to follow here;
    # it was dropped so the model is listed only once.
    {
        'T': '🟢',
        'model': '[mesolitica/gemma-2B-8192-fpf](https://huggingface.co/mesolitica/gemma-2B-8192-fpf)',
        'Tatabahasa 0-shot': 14.613180515759314,
        'Tatabahasa 1-shot': 25.501432664756447,
        'Tatabahasa 3-shots': 23.49570200573066,
    },
    {
        'T': '🟢',
        'model': '[mesolitica/Qwen1.5-0.5B-4096-fpf](https://huggingface.co/mesolitica/Qwen1.5-0.5B-4096-fpf)',
        'Tatabahasa 0-shot': 13.753581661891118,
        'Tatabahasa 1-shot': 21.20343839541547,
        'Tatabahasa 3-shots': 22.636103151862464,
    },
    {
        'T': '⭕',
        'model': '[mesolitica/mallam-1.1b-20k-instructions](https://huggingface.co/mesolitica/mallam-1.1b-20k-instructions)',
        'Tatabahasa 0-shot': 26.923076923076923,
        'Tatabahasa 1-shot': 28.939828080229223,
        'Tatabahasa 3-shots': 21.776504297994272,
    },
]
# Single leaderboard table: closed-source rows first, then open-source rows.
# Rows do not all carry the same benchmark keys; pandas leaves the absent
# cells as missing values.
data = pd.DataFrame([*close_source, *open_source])

# Page layout, top to bottom: title, introduction, score table.
with gr.Blocks(css=custom_css) as demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    # 'markdown' datatype so the [name](url) model cells render as links.
    gr.DataFrame(data, datatype='markdown')

# Start the Gradio server as soon as the script is run.
demo.launch()