huseinzol05 committed on
Commit e22d664 • 1 Parent(s): 869a523

added more scores

Files changed (1):
  1. app.py +25 -10
app.py CHANGED
@@ -2,25 +2,27 @@ import gradio as gr
 import pandas as pd
 from css_html_js import custom_css
 
-demo = gr.Blocks(css=custom_css)
-
 TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
 
 INTRODUCTION_TEXT = """
-📝 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks.\n
-🤗 All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
+📝 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks are at https://github.com/mesolitica/llm-benchmarks; feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with a link to the notebook.
 
 ## Dataset
 
 📈 We evaluate models based on 3 datasets,
 
 1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
+    - This test targets 15-year-old Malaysian students and covers reading comprehension and general knowledge in the Malay language.
 2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
+    - This is a general test of Malay grammar.
 3. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
+    - This is a general test of language reasoning.
+4. HumanEval, https://github.com/openai/human-eval
+    - This test covers programming language understanding.
 """
 
-data = [
-    {
+close_source = [
+    {
         'model': 'gpt-4-1106-preview',
         'BM-PT3 0-shot': 51.85185185185185,
         'BM-PT3 1-shot': 66.66666666666666,
@@ -38,6 +40,18 @@ data = [
         'Tatabahasa 1-shot': 60.80691642651297,
         'Tatabahasa 3-shots': 63.03724928366762,
     },
+    {
+        'model': 'Anthropic Claude 2',
+        'Tatabahasa 0-shot': 61,
+        'Tatabahasa 3-shots': 57.8,
+    },
+    {
+        'model': 'Anthropic Claude 1',
+        'Tatabahasa 3-shots': 67,
+    },
+]
+
+open_source = [
     {
         'model': '[llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
         'Tatabahasa 0-shot': 24.355300859598856,
@@ -103,9 +117,9 @@ data = [
         'BM-PT3 0-shot': 35.18518518518518,
         'BM-PT3 1-shot': 33.33333333333333,
         'BM-PT3 3-shots': 37.03703703703704,
-        'Tatabahasa 0-shot': 45.845272206303726,
-        'Tatabahasa 1-shot': 37.249283667621775,
-        'Tatabahasa 3-shots': 34.097421203438394,
+        'Tatabahasa 0-shot': 48.13753581661891,
+        'Tatabahasa 1-shot': 38.96848137535817,
+        'Tatabahasa 3-shots': 33.2378223495702,
     },
     {
         'model': '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
@@ -127,8 +141,9 @@ data = [
     }
 ]
 
-data = pd.DataFrame(data)
+data = pd.DataFrame(close_source + open_source)
 
+demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
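
For readers following the refactor, here is a minimal, self-contained sketch (not part of this commit) of what the merged `close_source + open_source` lists do once passed to `pd.DataFrame`: the columns become the union of all score keys, and models without a given score get NaN. The `gr.Dataframe` rendering, the rounding, and the abbreviated example rows are assumptions for illustration only; the Space's actual layout may differ.

```python
import gradio as gr
import pandas as pd

# Abbreviated entries mirroring the structure in app.py: closed-source models
# may only report a subset of the benchmark columns.
close_source = [
    {
        'model': 'gpt-4-1106-preview',
        'BM-PT3 0-shot': 51.85185185185185,
        'Tatabahasa 1-shot': 60.80691642651297,
    },
    {
        'model': 'Anthropic Claude 2',
        'Tatabahasa 0-shot': 61,
        'Tatabahasa 3-shots': 57.8,
    },
]
open_source = [
    {
        'model': '[llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
        'Tatabahasa 0-shot': 24.355300859598856,
    },
]

# Concatenating the two lists before building the DataFrame yields one table
# whose columns are the union of all keys; missing scores appear as NaN.
data = pd.DataFrame(close_source + open_source)

with gr.Blocks() as demo:
    gr.Markdown("## 🇲🇾 Malay LLM Leaderboard (sketch)")
    # The 'markdown' datatype for the first column renders the [name](url) links.
    gr.Dataframe(
        value=data.round(2),
        datatype=['markdown'] + ['number'] * (data.shape[1] - 1),
    )

if __name__ == '__main__':
    demo.launch()
```

If separate tabs for closed- and open-source models were wanted instead of one merged table, the same pattern would work with two `gr.Dataframe` components placed inside `gr.Tab` blocks; the commit as written keeps a single combined DataFrame.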