huseinzol05 committed on
Commit
fc64eda
•
1 Parent(s): 7963463
Files changed (1) hide show
  1. app.py +143 -146
app.py CHANGED
@@ -1,158 +1,155 @@
1
- # import gradio as gr
2
- # import pandas as pd
3
- # from css_html_js import custom_css
4
-
5
- # TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
6
 
7
- # INTRODUCTION_TEXT = """
8
- # πŸ“ The πŸ‡²πŸ‡Ύ Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
9
 
10
- # ## Dataset
 
11
 
12
- # 📈 We evaluate models based on 3 datasets,
13
 
14
- # 1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
15
- # - This test is for 15 years old Malaysia student, it is about reading comprehension and general knowledge for malay language.
16
- # 2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
17
- # - This test is general test for malay grammar.
18
- # 3. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
19
- # - This test is general test to language reasoning.
20
- # 4. HumanEval, https://github.com/openai/human-eval
21
- # - This test is for programming language understanding.
22
- # """
23
 
24
- # close_source = [
25
- # {
26
- # 'model': 'gpt-4-1106-preview',
27
- # 'BM-PT3 0-shot': 51.85185185185185,
28
- # 'BM-PT3 1-shot': 66.66666666666666,
29
- # 'BM-PT3 3-shots': 55.55555555555556,
30
- # 'Tatabahasa 0-shot': 75.64469914040114,
31
- # 'Tatabahasa 1-shot': 73.63896848137536,
32
- # 'Tatabahasa 3-shots': 75.64469914040114,
33
- # },
34
- # {
35
- # 'model': 'gpt-3.5-turbo-0613',
36
- # 'BM-PT3 0-shot': 36.53846153846153,
37
- # 'BM-PT3 1-shot': 28.846153846153843,
38
- # 'BM-PT3 3-shots': 24.528301886792452,
39
- # 'Tatabahasa 0-shot': 59.530791788856305,
40
- # 'Tatabahasa 1-shot': 60.80691642651297,
41
- # 'Tatabahasa 3-shots': 63.03724928366762,
42
- # },
43
- # {
44
- # 'model': 'Antrophic Claude 2',
45
- # 'Tatabahasa 0-shot': 61,
46
- # 'Tatabahasa 3-shots': 57.8,
47
- # },
48
- # {
49
- # 'model': 'Antrophic Claude 1',
50
- # 'Tatabahasa 3-shots': 67,
51
- # },
52
- # ]
53
 
54
- # open_source = [
55
- # {
56
- # 'model': '[llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
57
- # 'Tatabahasa 0-shot': 24.355300859598856,
58
- # 'Tatabahasa 1-shot': 28.08022922636103,
59
- # 'Tatabahasa 3-shots': 24.641833810888254,
60
- # },
61
- # {
62
- # 'model': '[malaysian-llama2-7b-32k](https://huggingface.co/mesolitica/llama-7b-hf-32768-fpf)',
63
- # 'BM-PT3 0-shot': 20.37037037037037,
64
- # 'BM-PT3 1-shot': 20.37037037037037,
65
- # 'BM-PT3 3-shots': 29.629629629629626,
66
- # 'Tatabahasa 0-shot': 17.765042979942695,
67
- # 'Tatabahasa 1-shot': 24.068767908309454,
68
- # 'Tatabahasa 3-shots': 27.507163323782237,
69
- # },
70
- # {
71
- # 'model': '[malaysian-llama2-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions)',
72
- # 'BM-PT3 0-shot': 35.294117647058826,
73
- # 'BM-PT3 1-shot': 21.153846153846153,
74
- # 'BM-PT3 3-shots': 28.30188679245283,
75
- # },
76
- # {
77
- # 'model': '[malaysian-llama2-13b-32k](https://huggingface.co/mesolitica/llama-13b-hf-32768-fpf)',
78
- # 'BM-PT3 0-shot': 33.33333333333333,
79
- # 'BM-PT3 1-shot': 20.37037037037037,
80
- # 'BM-PT3 3-shots': 31.48148148148148,
81
- # 'Tatabahasa 0-shot': 26.07449856733524,
82
- # 'Tatabahasa 1-shot': 25.214899713467048,
83
- # 'Tatabahasa 3-shots': 24.355300859598856,
84
- # },
85
- # {
86
- # 'model': '[malaysian-llama2-13b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions)',
87
- # 'BM-PT3 0-shot': 28.57142857142857,
88
- # 'BM-PT3 1-shot': 12.244897959183673,
89
- # 'BM-PT3 3-shots': 17.307692307692307,
90
- # },
91
- # {
92
- # 'model': '[mistral-7b](https://huggingface.co/mistralai/Mistral-7B-v0.1)',
93
- # 'Tatabahasa 0-shot': 28.939828080229223,
94
- # 'Tatabahasa 1-shot': 34.38395415472779,
95
- # 'Tatabahasa 3-shots': 32.95128939828081,
96
- # },
97
- # {
98
- # 'model': '[malaysian-mistral-7b-4k](https://huggingface.co/mesolitica/mistral-7b-4096-fpf)',
99
- # 'BM-PT3 0-shot': 20.37037037037037,
100
- # 'BM-PT3 1-shot': 22.22222222222222,
101
- # 'BM-PT3 3-shots': 33.33333333333333,
102
- # 'Tatabahasa 0-shot': 21.48997134670487,
103
- # 'Tatabahasa 1-shot': 28.939828080229223,
104
- # 'Tatabahasa 3-shots': 24.641833810888254,
105
- # },
106
- # {
107
- # 'model': '[malaysian-mistral-7b-32k](https://huggingface.co/mesolitica/mistral-7b-32768-fpf)',
108
- # 'BM-PT3 0-shot': 16.666666666666664,
109
- # 'BM-PT3 1-shot': 16.666666666666664,
110
- # 'BM-PT3 3-shots': 25.925925925925924,
111
- # 'Tatabahasa 0-shot': 18.624641833810887,
112
- # 'Tatabahasa 1-shot': 24.355300859598856,
113
- # 'Tatabahasa 3-shots': 28.653295128939828,
114
- # },
115
- # {
116
- # 'model': '[malaysian-mistral-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
117
- # 'BM-PT3 0-shot': 35.18518518518518,
118
- # 'BM-PT3 1-shot': 33.33333333333333,
119
- # 'BM-PT3 3-shots': 37.03703703703704,
120
- # 'Tatabahasa 0-shot': 55.014326647564474,
121
- # 'Tatabahasa 1-shot': 42.693409742120345,
122
- # 'Tatabahasa 3-shots': 33.33333333333333,
123
- # },
124
- # {
125
- # 'model': '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
126
- # 'BM-PT3 0-shot': 20.37037037037037,
127
- # 'BM-PT3 1-shot': 25.925925925925924,
128
- # 'BM-PT3 3-shots': 31.48148148148148,
129
- # 'Tatabahasa 0-shot': 21.776504297994272,
130
- # 'Tatabahasa 1-shot': 21.776504297994272,
131
- # 'Tatabahasa 3-shots': 24.641833810888254,
132
- # },
133
- # {
134
- # 'model': '[aisingapore/sealion7b](https://huggingface.co/aisingapore/sealion7b)',
135
- # 'BM-PT3 0-shot': 20.37037037037037,
136
- # 'BM-PT3 1-shot': 24.074074074074073,
137
- # 'BM-PT3 3-shots': 33.33333333333333,
138
- # 'Tatabahasa 0-shot': 25.787965616045845,
139
- # 'Tatabahasa 1-shot': 27.507163323782237,
140
- # 'Tatabahasa 3-shots': 26.07449856733524,
141
- # }
142
- # ]
143
 
144
- # data = pd.DataFrame(close_source + open_source)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
- # demo = gr.Blocks(css=custom_css)
147
- # with demo:
148
- # gr.HTML(TITLE)
149
- # gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
150
- # gr.DataFrame(data, datatype = 'markdown')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
- # demo.launch()
153
 
154
- import gradio as gr
155
- demo = gr.Blocks()
156
  with demo:
157
- gr.HTML('helo')
 
 
 
158
  demo.launch()
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ from css_html_js import custom_css
 
 
4
 
5
+ TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
 
6
 
7
+ INTRODUCTION_TEXT = """
8
+ πŸ“ The πŸ‡²πŸ‡Ύ Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
9
 
10
+ ## Dataset
11
 
12
+ 📈 We evaluate models based on 3 datasets,
 
 
 
 
 
 
 
 
13
 
14
+ 1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
15
+ - This test is for 15 years old Malaysia student, it is about reading comprehension and general knowledge for malay language.
16
+ 2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
17
+ - This test is general test for malay grammar.
18
+ 3. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
19
+ - This test is general test to language reasoning.
20
+ 4. HumanEval, https://github.com/openai/human-eval
21
+ - This test is for programming language understanding.
22
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
+ not_verify = [
25
+ {
26
+ 'model': 'Antrophic Claude 2',
27
+ 'Tatabahasa 0-shot': 61,
28
+ 'Tatabahasa 3-shots': 57.8,
29
+ },
30
+ {
31
+ 'model': 'Antrophic Claude 1',
32
+ 'Tatabahasa 3-shots': 67,
33
+ },
34
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
+ close_source = [
37
+ {
38
+ 'model': 'gpt-4-1106-preview',
39
+ 'BM-PT3 0-shot': 51.85185185185185,
40
+ 'BM-PT3 1-shot': 66.66666666666666,
41
+ 'BM-PT3 3-shots': 55.55555555555556,
42
+ 'Tatabahasa 0-shot': 75.64469914040114,
43
+ 'Tatabahasa 1-shot': 73.63896848137536,
44
+ 'Tatabahasa 3-shots': 75.64469914040114,
45
+ },
46
+ {
47
+ 'model': 'gpt-3.5-turbo-0613',
48
+ 'BM-PT3 0-shot': 36.53846153846153,
49
+ 'BM-PT3 1-shot': 28.846153846153843,
50
+ 'BM-PT3 3-shots': 24.528301886792452,
51
+ 'Tatabahasa 0-shot': 59.530791788856305,
52
+ 'Tatabahasa 1-shot': 60.80691642651297,
53
+ 'Tatabahasa 3-shots': 63.03724928366762,
54
+ },
55
+ ]
56
 
57
+ open_source = [
58
+ {
59
+ 'model': '[llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
60
+ 'Tatabahasa 0-shot': 24.355300859598856,
61
+ 'Tatabahasa 1-shot': 28.08022922636103,
62
+ 'Tatabahasa 3-shots': 24.641833810888254,
63
+ },
64
+ {
65
+ 'model': '[malaysian-llama2-7b-32k](https://huggingface.co/mesolitica/llama-7b-hf-32768-fpf)',
66
+ 'BM-PT3 0-shot': 20.37037037037037,
67
+ 'BM-PT3 1-shot': 20.37037037037037,
68
+ 'BM-PT3 3-shots': 29.629629629629626,
69
+ 'Tatabahasa 0-shot': 17.765042979942695,
70
+ 'Tatabahasa 1-shot': 24.068767908309454,
71
+ 'Tatabahasa 3-shots': 27.507163323782237,
72
+ },
73
+ {
74
+ 'model': '[malaysian-llama2-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions)',
75
+ 'BM-PT3 0-shot': 35.294117647058826,
76
+ 'BM-PT3 1-shot': 21.153846153846153,
77
+ 'BM-PT3 3-shots': 28.30188679245283,
78
+ },
79
+ {
80
+ 'model': '[malaysian-llama2-13b-32k](https://huggingface.co/mesolitica/llama-13b-hf-32768-fpf)',
81
+ 'BM-PT3 0-shot': 33.33333333333333,
82
+ 'BM-PT3 1-shot': 20.37037037037037,
83
+ 'BM-PT3 3-shots': 31.48148148148148,
84
+ 'Tatabahasa 0-shot': 26.07449856733524,
85
+ 'Tatabahasa 1-shot': 25.214899713467048,
86
+ 'Tatabahasa 3-shots': 24.355300859598856,
87
+ },
88
+ {
89
+ 'model': '[malaysian-llama2-13b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions)',
90
+ 'BM-PT3 0-shot': 28.57142857142857,
91
+ 'BM-PT3 1-shot': 12.244897959183673,
92
+ 'BM-PT3 3-shots': 17.307692307692307,
93
+ },
94
+ {
95
+ 'model': '[mistral-7b](https://huggingface.co/mistralai/Mistral-7B-v0.1)',
96
+ 'Tatabahasa 0-shot': 28.939828080229223,
97
+ 'Tatabahasa 1-shot': 34.38395415472779,
98
+ 'Tatabahasa 3-shots': 32.95128939828081,
99
+ },
100
+ {
101
+ 'model': '[malaysian-mistral-7b-4k](https://huggingface.co/mesolitica/mistral-7b-4096-fpf)',
102
+ 'BM-PT3 0-shot': 20.37037037037037,
103
+ 'BM-PT3 1-shot': 22.22222222222222,
104
+ 'BM-PT3 3-shots': 33.33333333333333,
105
+ 'Tatabahasa 0-shot': 21.48997134670487,
106
+ 'Tatabahasa 1-shot': 28.939828080229223,
107
+ 'Tatabahasa 3-shots': 24.641833810888254,
108
+ },
109
+ {
110
+ 'model': '[malaysian-mistral-7b-32k](https://huggingface.co/mesolitica/mistral-7b-32768-fpf)',
111
+ 'BM-PT3 0-shot': 16.666666666666664,
112
+ 'BM-PT3 1-shot': 16.666666666666664,
113
+ 'BM-PT3 3-shots': 25.925925925925924,
114
+ 'Tatabahasa 0-shot': 18.624641833810887,
115
+ 'Tatabahasa 1-shot': 24.355300859598856,
116
+ 'Tatabahasa 3-shots': 28.653295128939828,
117
+ },
118
+ {
119
+ 'model': '[malaysian-mistral-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
120
+ 'BM-PT3 0-shot': 35.18518518518518,
121
+ 'BM-PT3 1-shot': 33.33333333333333,
122
+ 'BM-PT3 3-shots': 37.03703703703704,
123
+ 'Tatabahasa 0-shot': 55.014326647564474,
124
+ 'Tatabahasa 1-shot': 42.693409742120345,
125
+ 'Tatabahasa 3-shots': 33.33333333333333,
126
+ },
127
+ {
128
+ 'model': '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
129
+ 'BM-PT3 0-shot': 20.37037037037037,
130
+ 'BM-PT3 1-shot': 25.925925925925924,
131
+ 'BM-PT3 3-shots': 31.48148148148148,
132
+ 'Tatabahasa 0-shot': 21.776504297994272,
133
+ 'Tatabahasa 1-shot': 21.776504297994272,
134
+ 'Tatabahasa 3-shots': 24.641833810888254,
135
+ },
136
+ {
137
+ 'model': '[aisingapore/sealion7b](https://huggingface.co/aisingapore/sealion7b)',
138
+ 'BM-PT3 0-shot': 20.37037037037037,
139
+ 'BM-PT3 1-shot': 24.074074074074073,
140
+ 'BM-PT3 3-shots': 33.33333333333333,
141
+ 'Tatabahasa 0-shot': 25.787965616045845,
142
+ 'Tatabahasa 1-shot': 27.507163323782237,
143
+ 'Tatabahasa 3-shots': 26.07449856733524,
144
+ }
145
+ ]
146
 
147
+ data = pd.DataFrame(close_source + open_source)
148
 
149
+ demo = gr.Blocks(css=custom_css)
 
150
  with demo:
151
+ gr.HTML(TITLE)
152
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
153
+ gr.DataFrame(data, datatype = 'markdown')
154
+
155
  demo.launch()