Spaces:

mesolitica
/

malay-llm-leaderboard

Running

App Files Files Community

huseinzol05 committed on Nov 24, 2023

Commit

fc64eda

•

1 Parent(s): 7963463

fix

Browse files

Files changed (1) hide show

app.py +143 -146

app.py CHANGED Viewed

@@ -1,158 +1,155 @@
-# import gradio as gr
-# import pandas as pd
-# from css_html_js import custom_css
-# TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
-# INTRODUCTION_TEXT = """
-# 📐 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
-# ## Dataset
-# 📈 We evaluate models based on 3 datasets,
-# 1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
-# - This test is for 15 years old Malaysia student, it is about reading comprehension and general knowledge for malay language.
-# 2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
-# - This test is general test for malay grammar.
-# 3. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
-# - This test is general test to language reasoning.
-# 4. HumanEval, https://github.com/openai/human-eval
-# - This test is for programming language understanding.
-# """
-# close_source = [
-#     {
-#         'model': 'gpt-4-1106-preview',
-#         'BM-PT3 0-shot': 51.85185185185185,
-#         'BM-PT3 1-shot': 66.66666666666666,
-#         'BM-PT3 3-shots': 55.55555555555556,
-#         'Tatabahasa 0-shot': 75.64469914040114,
-#         'Tatabahasa 1-shot': 73.63896848137536,
-#         'Tatabahasa 3-shots': 75.64469914040114,
-#     },
-#     {
-#         'model': 'gpt-3.5-turbo-0613',
-#         'BM-PT3 0-shot': 36.53846153846153,
-#         'BM-PT3 1-shot': 28.846153846153843,
-#         'BM-PT3 3-shots': 24.528301886792452,
-#         'Tatabahasa 0-shot': 59.530791788856305,
-#         'Tatabahasa 1-shot': 60.80691642651297,
-#         'Tatabahasa 3-shots': 63.03724928366762,
-#     },
-#     {
-#         'model': 'Antrophic Claude 2',
-#         'Tatabahasa 0-shot': 61,
-#         'Tatabahasa 3-shots': 57.8,
-#     },
-#     {
-#         'model': 'Antrophic Claude 1',
-#         'Tatabahasa 3-shots': 67,
-#     },
-# ]
-# open_source = [
-#     {
-#         'model': '[llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
-#         'Tatabahasa 0-shot': 24.355300859598856,
-#         'Tatabahasa 1-shot': 28.08022922636103,
-#         'Tatabahasa 3-shots': 24.641833810888254,
-#     },
-#     {
-#         'model': '[malaysian-llama2-7b-32k](https://huggingface.co/mesolitica/llama-7b-hf-32768-fpf)',
-#         'BM-PT3 0-shot': 20.37037037037037,
-#         'BM-PT3 1-shot': 20.37037037037037,
-#         'BM-PT3 3-shots': 29.629629629629626,
-#         'Tatabahasa 0-shot': 17.765042979942695,
-#         'Tatabahasa 1-shot': 24.068767908309454,
-#         'Tatabahasa 3-shots': 27.507163323782237,
-#     },
-#     {
-#         'model': '[malaysian-llama2-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions)',
-#         'BM-PT3 0-shot': 35.294117647058826,
-#         'BM-PT3 1-shot': 21.153846153846153,
-#         'BM-PT3 3-shots': 28.30188679245283,
-#     },
-#     {
-#         'model': '[malaysian-llama2-13b-32k](https://huggingface.co/mesolitica/llama-13b-hf-32768-fpf)',
-#         'BM-PT3 0-shot': 33.33333333333333,
-#         'BM-PT3 1-shot': 20.37037037037037,
-#         'BM-PT3 3-shots': 31.48148148148148,
-#         'Tatabahasa 0-shot': 26.07449856733524,
-#         'Tatabahasa 1-shot': 25.214899713467048,
-#         'Tatabahasa 3-shots': 24.355300859598856,
-#     },
-#     {
-#         'model': '[malaysian-llama2-13b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions)',
-#         'BM-PT3 0-shot': 28.57142857142857,
-#         'BM-PT3 1-shot': 12.244897959183673,
-#         'BM-PT3 3-shots': 17.307692307692307,
-#     },
-#     {
-#         'model': '[mistral-7b](https://huggingface.co/mistralai/Mistral-7B-v0.1)',
-#         'Tatabahasa 0-shot': 28.939828080229223,
-#         'Tatabahasa 1-shot': 34.38395415472779,
-#         'Tatabahasa 3-shots': 32.95128939828081,
-#     },
-#     {
-#         'model': '[malaysian-mistral-7b-4k](https://huggingface.co/mesolitica/mistral-7b-4096-fpf)',
-#         'BM-PT3 0-shot': 20.37037037037037,
-#         'BM-PT3 1-shot': 22.22222222222222,
-#         'BM-PT3 3-shots': 33.33333333333333,
-#         'Tatabahasa 0-shot': 21.48997134670487,
-#         'Tatabahasa 1-shot': 28.939828080229223,
-#         'Tatabahasa 3-shots': 24.641833810888254,
-#     },
-#     {
-#         'model': '[malaysian-mistral-7b-32k](https://huggingface.co/mesolitica/mistral-7b-32768-fpf)',
-#         'BM-PT3 0-shot': 16.666666666666664,
-#         'BM-PT3 1-shot': 16.666666666666664,
-#         'BM-PT3 3-shots': 25.925925925925924,
-#         'Tatabahasa 0-shot': 18.624641833810887,
-#         'Tatabahasa 1-shot': 24.355300859598856,
-#         'Tatabahasa 3-shots': 28.653295128939828,
-#     },
-#     {
-#         'model': '[malaysian-mistral-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
-#         'BM-PT3 0-shot': 35.18518518518518,
-#         'BM-PT3 1-shot': 33.33333333333333,
-#         'BM-PT3 3-shots': 37.03703703703704,
-#         'Tatabahasa 0-shot': 55.014326647564474,
-#         'Tatabahasa 1-shot': 42.693409742120345,
-#         'Tatabahasa 3-shots': 33.33333333333333,
-#     },
-#     {
-#         'model': '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
-#         'BM-PT3 0-shot': 20.37037037037037,
-#         'BM-PT3 1-shot': 25.925925925925924,
-#         'BM-PT3 3-shots': 31.48148148148148,
-#         'Tatabahasa 0-shot': 21.776504297994272,
-#         'Tatabahasa 1-shot': 21.776504297994272,
-#         'Tatabahasa 3-shots': 24.641833810888254,
-#     },
-#     {
-#         'model': '[aisingapore/sealion7b](https://huggingface.co/aisingapore/sealion7b)',
-#         'BM-PT3 0-shot': 20.37037037037037,
-#         'BM-PT3 1-shot': 24.074074074074073,
-#         'BM-PT3 3-shots': 33.33333333333333,
-#         'Tatabahasa 0-shot': 25.787965616045845,
-#         'Tatabahasa 1-shot': 27.507163323782237,
-#         'Tatabahasa 3-shots': 26.07449856733524,
-#     }
-# ]
-# data = pd.DataFrame(close_source + open_source)
-# demo = gr.Blocks(css=custom_css)
-# with demo:
-#     gr.HTML(TITLE)
-#     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
-#     gr.DataFrame(data, datatype = 'markdown')
-# demo.launch()
-import gradio as gr
-demo = gr.Blocks()
 with demo:
-    gr.HTML('helo')
 demo.launch()

+import gradio as gr
+import pandas as pd
+from css_html_js import custom_css
+TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
+INTRODUCTION_TEXT = """
+📐 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks are at https://github.com/mesolitica/llm-benchmarks; feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with a link to the notebook.
+## Dataset
+📈 We evaluate models based on 4 datasets,
+1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
+- This test is for 15-year-old Malaysian students; it covers reading comprehension and general knowledge of the Malay language.
+2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
+- This is a general test of Malay grammar.
+3. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
+- This is a general test of language reasoning.
+4. HumanEval, https://github.com/openai/human-eval
+- This test is for programming language understanding.
+"""
+not_verify = [
+    {
+        'model': 'Antrophic Claude 2',
+        'Tatabahasa 0-shot': 61,
+        'Tatabahasa 3-shots': 57.8,
+    },
+    {
+        'model': 'Antrophic Claude 1',
+        'Tatabahasa 3-shots': 67,
+    },
+]
+close_source = [
+    {
+        'model': 'gpt-4-1106-preview',
+        'BM-PT3 0-shot': 51.85185185185185,
+        'BM-PT3 1-shot': 66.66666666666666,
+        'BM-PT3 3-shots': 55.55555555555556,
+        'Tatabahasa 0-shot': 75.64469914040114,
+        'Tatabahasa 1-shot': 73.63896848137536,
+        'Tatabahasa 3-shots': 75.64469914040114,
+    },
+    {
+        'model': 'gpt-3.5-turbo-0613',
+        'BM-PT3 0-shot': 36.53846153846153,
+        'BM-PT3 1-shot': 28.846153846153843,
+        'BM-PT3 3-shots': 24.528301886792452,
+        'Tatabahasa 0-shot': 59.530791788856305,
+        'Tatabahasa 1-shot': 60.80691642651297,
+        'Tatabahasa 3-shots': 63.03724928366762,
+    },
+]
+open_source = [
+    {
+        'model': '[llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
+        'Tatabahasa 0-shot': 24.355300859598856,
+        'Tatabahasa 1-shot': 28.08022922636103,
+        'Tatabahasa 3-shots': 24.641833810888254,
+    },
+    {
+        'model': '[malaysian-llama2-7b-32k](https://huggingface.co/mesolitica/llama-7b-hf-32768-fpf)',
+        'BM-PT3 0-shot': 20.37037037037037,
+        'BM-PT3 1-shot': 20.37037037037037,
+        'BM-PT3 3-shots': 29.629629629629626,
+        'Tatabahasa 0-shot': 17.765042979942695,
+        'Tatabahasa 1-shot': 24.068767908309454,
+        'Tatabahasa 3-shots': 27.507163323782237,
+    },
+    {
+        'model': '[malaysian-llama2-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions)',
+        'BM-PT3 0-shot': 35.294117647058826,
+        'BM-PT3 1-shot': 21.153846153846153,
+        'BM-PT3 3-shots': 28.30188679245283,
+    },
+    {
+        'model': '[malaysian-llama2-13b-32k](https://huggingface.co/mesolitica/llama-13b-hf-32768-fpf)',
+        'BM-PT3 0-shot': 33.33333333333333,
+        'BM-PT3 1-shot': 20.37037037037037,
+        'BM-PT3 3-shots': 31.48148148148148,
+        'Tatabahasa 0-shot': 26.07449856733524,
+        'Tatabahasa 1-shot': 25.214899713467048,
+        'Tatabahasa 3-shots': 24.355300859598856,
+    },
+    {
+        'model': '[malaysian-llama2-13b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions)',
+        'BM-PT3 0-shot': 28.57142857142857,
+        'BM-PT3 1-shot': 12.244897959183673,
+        'BM-PT3 3-shots': 17.307692307692307,
+    },
+    {
+        'model': '[mistral-7b](https://huggingface.co/mistralai/Mistral-7B-v0.1)',
+        'Tatabahasa 0-shot': 28.939828080229223,
+        'Tatabahasa 1-shot': 34.38395415472779,
+        'Tatabahasa 3-shots': 32.95128939828081,
+    },
+    {
+        'model': '[malaysian-mistral-7b-4k](https://huggingface.co/mesolitica/mistral-7b-4096-fpf)',
+        'BM-PT3 0-shot': 20.37037037037037,
+        'BM-PT3 1-shot': 22.22222222222222,
+        'BM-PT3 3-shots': 33.33333333333333,
+        'Tatabahasa 0-shot': 21.48997134670487,
+        'Tatabahasa 1-shot': 28.939828080229223,
+        'Tatabahasa 3-shots': 24.641833810888254,
+    },
+    {
+        'model': '[malaysian-mistral-7b-32k](https://huggingface.co/mesolitica/mistral-7b-32768-fpf)',
+        'BM-PT3 0-shot': 16.666666666666664,
+        'BM-PT3 1-shot': 16.666666666666664,
+        'BM-PT3 3-shots': 25.925925925925924,
+        'Tatabahasa 0-shot': 18.624641833810887,
+        'Tatabahasa 1-shot': 24.355300859598856,
+        'Tatabahasa 3-shots': 28.653295128939828,
+    },
+    {
+        'model': '[malaysian-mistral-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
+        'BM-PT3 0-shot': 35.18518518518518,
+        'BM-PT3 1-shot': 33.33333333333333,
+        'BM-PT3 3-shots': 37.03703703703704,
+        'Tatabahasa 0-shot': 55.014326647564474,
+        'Tatabahasa 1-shot': 42.693409742120345,
+        'Tatabahasa 3-shots': 33.33333333333333,
+    },
+    {
+        'model': '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
+        'BM-PT3 0-shot': 20.37037037037037,
+        'BM-PT3 1-shot': 25.925925925925924,
+        'BM-PT3 3-shots': 31.48148148148148,
+        'Tatabahasa 0-shot': 21.776504297994272,
+        'Tatabahasa 1-shot': 21.776504297994272,
+        'Tatabahasa 3-shots': 24.641833810888254,
+    },
+    {
+        'model': '[aisingapore/sealion7b](https://huggingface.co/aisingapore/sealion7b)',
+        'BM-PT3 0-shot': 20.37037037037037,
+        'BM-PT3 1-shot': 24.074074074074073,
+        'BM-PT3 3-shots': 33.33333333333333,
+        'Tatabahasa 0-shot': 25.787965616045845,
+        'Tatabahasa 1-shot': 27.507163323782237,
+        'Tatabahasa 3-shots': 26.07449856733524,
+    }
+]
+data = pd.DataFrame(close_source + open_source)
+demo = gr.Blocks(css=custom_css)
 with demo:
+    gr.HTML(TITLE)
+    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
+    gr.DataFrame(data, datatype = 'markdown')
 demo.launch()