panuthept commited on
Commit
cf7ae80
•
1 Parent(s): 9957df6

update results

Browse files
Files changed (1) hide show
  1. app.py +21 -262
app.py CHANGED
@@ -2,10 +2,10 @@ import gradio as gr
2
  import pandas as pd
3
  from css_html_js import custom_css
4
 
5
- TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
6
 
7
  INTRODUCTION_TEXT = """
8
- 📐 The 🇲🇾 Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
9
  ## Dataset
10
  📈 We evaluate models based on 3 datasets,
11
  1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
@@ -20,275 +20,34 @@ INTRODUCTION_TEXT = """
20
  1. Claude 1.3 and 2.0 Tatabahasa contributed by https://www.linkedin.com/in/fahim-surani
21
  2. Claude 3.0 contributed by https://github.com/theblackcat102, https://huggingface.co/theblackcat102
22
  ## Tagging
23
- 🟢 pretrained ⭕ instruction-tuned 📦 close sourced
24
  """
25
 
26
- close_source = [
27
- {
28
- 'T': '📦',
29
- 'model': 'claude-3-opus-20240229',
30
- 'BM-PT3 0-shot': 57.41,
31
- 'BM-PT3 1-shot': 53.70,
32
- 'BM-PT3 3-shots': 62.96,
33
- 'Tatabahasa 0-shot': 77.08,
34
- 'Tatabahasa 1-shot': 73.93,
35
- 'Tatabahasa 3-shots': 75.64,
36
- },
37
- {
38
- 'T': '📦',
39
- 'model': 'claude-3-sonnet-20240229',
40
- 'BM-PT3 0-shot': 48.15,
41
- 'BM-PT3 1-shot': 50.00,
42
- 'BM-PT3 3-shots': 37.04,
43
- 'Tatabahasa 0-shot': 65.90,
44
- 'Tatabahasa 1-shot': 38.40,
45
- 'Tatabahasa 3-shots': 40.97,
46
- },
47
- {
48
- 'T': '📦',
49
- 'model': 'claude-3-haiku-20240307',
50
- 'BM-PT3 0-shot': 48.15,
51
- 'BM-PT3 1-shot': 50.00,
52
- 'BM-PT3 3-shots': 50.00,
53
- 'Tatabahasa 0-shot': 62.75,
54
- 'Tatabahasa 1-shot': 49.86,
55
- 'Tatabahasa 3-shots': 24.07,
56
- },
57
- {
58
- 'T': '📦',
59
- 'model': 'AWS Bedrock Claude 1.3',
60
- 'Tatabahasa 0-shot': 60.650887573964496,
61
- 'Tatabahasa 1-shot': 62.46418338108882,
62
- 'Tatabahasa 3-shots': 67.34104046242774,
63
- },
64
- {
65
- 'T': '📦',
66
- 'model': 'AWS Bedrock Claude 2',
67
- 'Tatabahasa 0-shot': 61.702127659574465,
68
- 'Tatabahasa 1-shot': 60.17191977077364,
69
- 'Tatabahasa 3-shots': 59.598853868194844,
70
- },
71
- {
72
- 'T': '📦',
73
- 'model': 'gpt-4-1106-preview',
74
- 'BM-PT3 0-shot': 51.85185185185185,
75
- 'BM-PT3 1-shot': 66.66666666666666,
76
- 'BM-PT3 3-shots': 55.55555555555556,
77
- 'Tatabahasa 0-shot': 75.64469914040114,
78
- 'Tatabahasa 1-shot': 73.63896848137536,
79
- 'Tatabahasa 3-shots': 75.64469914040114,
80
- },
81
- {
82
- 'T': '📦',
83
- 'model': 'gpt-3.5-turbo-0613',
84
- 'BM-PT3 0-shot': 36.53846153846153,
85
- 'BM-PT3 1-shot': 28.846153846153843,
86
- 'BM-PT3 3-shots': 24.528301886792452,
87
- 'Tatabahasa 0-shot': 59.530791788856305,
88
- 'Tatabahasa 1-shot': 60.80691642651297,
89
- 'Tatabahasa 3-shots': 63.03724928366762,
90
- },
91
- ]
92
-
93
- open_source = [
94
  {
95
  'T': '🟢',
96
- 'model': '[meta-llama/llama2-7b](https://huggingface.co/meta-llama/Llama-2-7b-hf)',
97
- 'Tatabahasa 0-shot': 24.355300859598856,
98
- 'Tatabahasa 1-shot': 28.08022922636103,
99
- 'Tatabahasa 3-shots': 24.641833810888254,
 
 
 
 
100
  },
101
  {
102
- 'T': '🟢',
103
- 'model': '[mesolitica/tinyllama-1.1b-4096-fpf](https://huggingface.co/mesolitica/tinyllama-1.1b-4096-fpf)',
104
- 'Tatabahasa 0-shot': 23.248407643312103,
105
- 'Tatabahasa 1-shot': 27.22063037249284,
106
- 'Tatabahasa 3-shots': 24.355300859598856,
107
- },
108
- {
109
- 'T': '🟢',
110
- 'model': '[mesolitica/malaysian-llama2-7b-32k](https://huggingface.co/mesolitica/llama-7b-hf-32768-fpf)',
111
- 'BM-PT3 0-shot': 20.37037037037037,
112
- 'BM-PT3 1-shot': 20.37037037037037,
113
- 'BM-PT3 3-shots': 29.629629629629626,
114
- 'Tatabahasa 0-shot': 17.765042979942695,
115
- 'Tatabahasa 1-shot': 24.068767908309454,
116
- 'Tatabahasa 3-shots': 27.507163323782237,
117
- },
118
- {
119
- 'T': '⭕',
120
- 'model': '[mesolitica/malaysian-llama2-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-7b-32k-instructions-v2)',
121
- 'BM-PT3 0-shot': 33.33333333333333,
122
- 'BM-PT3 1-shot': 37.03703703703704,
123
- 'BM-PT3 3-shots': 35.18518518518518,
124
- 'Tatabahasa 0-shot': 59.31232091690545,
125
- 'Tatabahasa 1-shot': 53.86819484240688,
126
- 'Tatabahasa 3-shots': 45.55873925501432,
127
- },
128
- {
129
- 'T': '🟢',
130
- 'model': '[mesolitica/malaysian-llama2-13b-32k](https://huggingface.co/mesolitica/llama-13b-hf-32768-fpf)',
131
- 'BM-PT3 0-shot': 33.33333333333333,
132
- 'BM-PT3 1-shot': 20.37037037037037,
133
- 'BM-PT3 3-shots': 31.48148148148148,
134
- 'Tatabahasa 0-shot': 26.07449856733524,
135
- 'Tatabahasa 1-shot': 25.214899713467048,
136
- 'Tatabahasa 3-shots': 24.355300859598856,
137
- },
138
- {
139
- 'T': '⭕',
140
- 'model': '[mistralai/malaysian-llama2-13b-32k-instructions](https://huggingface.co/mesolitica/malaysian-llama2-13b-32k-instructions)',
141
- 'BM-PT3 0-shot': 28.57142857142857,
142
- 'BM-PT3 1-shot': 12.244897959183673,
143
- 'BM-PT3 3-shots': 17.307692307692307,
144
- },
145
- {
146
- 'T': '🟢',
147
- 'model': '[mistralai/mistral-7b](https://huggingface.co/mistralai/Mistral-7B-v0.1)',
148
- 'Tatabahasa 0-shot': 28.939828080229223,
149
- 'Tatabahasa 1-shot': 34.38395415472779,
150
- 'Tatabahasa 3-shots': 32.95128939828081,
151
- },
152
- {
153
- 'T': '🟢',
154
- 'model': '[mesolitica/malaysian-mistral-7b-4k](https://huggingface.co/mesolitica/mistral-7b-4096-fpf)',
155
- 'BM-PT3 0-shot': 20.37037037037037,
156
- 'BM-PT3 1-shot': 22.22222222222222,
157
- 'BM-PT3 3-shots': 33.33333333333333,
158
- 'Tatabahasa 0-shot': 21.48997134670487,
159
- 'Tatabahasa 1-shot': 28.939828080229223,
160
- 'Tatabahasa 3-shots': 24.641833810888254,
161
- },
162
- {
163
- 'T': '🟢',
164
- 'model': '[mesolitica/malaysian-mistral-7b-32k](https://huggingface.co/mesolitica/mistral-7b-32768-fpf)',
165
- 'BM-PT3 0-shot': 16.666666666666664,
166
- 'BM-PT3 1-shot': 16.666666666666664,
167
- 'BM-PT3 3-shots': 25.925925925925924,
168
- 'Tatabahasa 0-shot': 18.624641833810887,
169
- 'Tatabahasa 1-shot': 24.355300859598856,
170
- 'Tatabahasa 3-shots': 28.653295128939828,
171
- },
172
- {
173
- 'T': '⭕',
174
- 'model': '[mesolitica/malaysian-mistral-7b-32k-instructions](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
175
- 'BM-PT3 0-shot': 40.74074074074074,
176
- 'BM-PT3 1-shot': 33.33333333333333,
177
- 'BM-PT3 3-shots': 37.03703703703704,
178
- 'Tatabahasa 0-shot': 65.32951289398281,
179
- 'Tatabahasa 1-shot': 57.306590257879655,
180
- 'Tatabahasa 3-shots': 56.446991404011456,
181
- },
182
- {
183
- 'T': '⭕',
184
- 'model': '[mesolitica/malaysian-mistral-7b-32k-instructions-v4](https://huggingface.co/mesolitica/malaysian-mistral-7b-32k-instructions)',
185
- 'BM-PT3 0-shot': 35.18518518518518,
186
- 'BM-PT3 1-shot': 31.48148148148148,
187
- 'BM-PT3 3-shots': 33.33333333333333,
188
- 'Tatabahasa 0-shot': 66.4756446991404,
189
- 'Tatabahasa 1-shot': 54.15472779369628,
190
- 'Tatabahasa 3-shots': 49.8567335243553,
191
- },
192
- {
193
- 'T': '🟢',
194
- 'model': '[aisingapore/sealion3b](https://huggingface.co/aisingapore/sealion3b)',
195
- 'BM-PT3 0-shot': 20.37037037037037,
196
- 'BM-PT3 1-shot': 25.925925925925924,
197
- 'BM-PT3 3-shots': 31.48148148148148,
198
- 'Tatabahasa 0-shot': 21.776504297994272,
199
- 'Tatabahasa 1-shot': 21.776504297994272,
200
- 'Tatabahasa 3-shots': 24.641833810888254,
201
- },
202
- {
203
- 'T': '🟢',
204
- 'model': '[aisingapore/sealion7b](https://huggingface.co/aisingapore/sealion7b)',
205
- 'BM-PT3 0-shot': 20.37037037037037,
206
- 'BM-PT3 1-shot': 24.074074074074073,
207
- 'BM-PT3 3-shots': 33.33333333333333,
208
- 'Tatabahasa 0-shot': 25.787965616045845,
209
- 'Tatabahasa 1-shot': 27.507163323782237,
210
- 'Tatabahasa 3-shots': 26.07449856733524,
211
- },
212
- {
213
- 'T': '🟢',
214
- 'model': '[mesolitica/mallam-1.1B-4096](https://huggingface.co/mesolitica/mallam-1.1B-4096)',
215
- 'Tatabahasa 0-shot': 25.757575757575758,
216
- 'Tatabahasa 1-shot': 25.787965616045845,
217
- 'Tatabahasa 3-shots': 28.08022922636103,
218
- },
219
- {
220
- 'T': '🟢',
221
- 'model': '[mesolitica/mallam-3B-4096](https://huggingface.co/mesolitica/mallam-3B-4096)',
222
- 'Tatabahasa 0-shot': 24.567474048442904,
223
- 'Tatabahasa 1-shot': 24.641833810888254,
224
- 'Tatabahasa 3-shots': 28.653295128939828,
225
- },
226
- {
227
- 'T': '🟢',
228
- 'model': '[mesolitica/mallam-5B-4096](https://huggingface.co/mesolitica/mallam-5B-4096)',
229
- 'Tatabahasa 0-shot': 24.074074074074073,
230
- 'Tatabahasa 1-shot': 27.793696275071632,
231
- 'Tatabahasa 3-shots': 28.653295128939828,
232
- },
233
- {
234
- 'T': '🟢',
235
- 'model': '[sail/Sailor-0.5B](https://huggingface.co/sail/Sailor-0.5B)',
236
- 'Tatabahasa 0-shot': 17.191977077363894,
237
- 'Tatabahasa 1-shot': 23.78223495702006,
238
- 'Tatabahasa 3-shots': 25.501432664756447,
239
- },
240
- {
241
- 'T': '🟢',
242
- 'model': '[sail/Sailor-1.8B](https://huggingface.co/sail/Sailor-1.8B)',
243
- 'Tatabahasa 0-shot': 29.512893982808023,
244
- 'Tatabahasa 1-shot': 27.507163323782237,
245
- 'Tatabahasa 3-shots': 24.92836676217765,
246
- },
247
- {
248
- 'T': '🟢',
249
- 'model': '[sail/Sailor-4B](https://huggingface.co/sail/Sailor-4B)',
250
- 'Tatabahasa 0-shot': 31.51862464183381,
251
- 'Tatabahasa 1-shot': 36.10315186246418,
252
- 'Tatabahasa 3-shots': 27.507163323782237,
253
- },
254
- {
255
- 'T': '🟢',
256
- 'model': '[sail/Sailor-7B](https://huggingface.co/sail/Sailor-7B)',
257
- 'Tatabahasa 0-shot': 55.30085959885387,
258
- 'Tatabahasa 1-shot': 54.72779369627507,
259
- 'Tatabahasa 3-shots': 59.02578796561605,
260
- },
261
- {
262
- 'T': '🟢',
263
- 'model': '[mesolitica/mallam-5B-4096](https://huggingface.co/mesolitica/mallam-5B-4096)',
264
- 'Tatabahasa 0-shot': 24.074074074074073,
265
- 'Tatabahasa 1-shot': 27.793696275071632,
266
- 'Tatabahasa 3-shots': 28.653295128939828,
267
- },
268
- {
269
- 'T': '🟢',
270
- 'model': '[mesolitica/gemma-2B-8192-fpf](https://huggingface.co/mesolitica/gemma-2B-8192-fpf)',
271
- 'Tatabahasa 0-shot': 14.613180515759314,
272
- 'Tatabahasa 1-shot': 25.501432664756447,
273
- 'Tatabahasa 3-shots': 23.49570200573066,
274
- },
275
- {
276
- 'T': '🟢',
277
- 'model': '[mesolitica/Qwen1.5-0.5B-4096-fpf](https://huggingface.co/mesolitica/Qwen1.5-0.5B-4096-fpf)',
278
- 'Tatabahasa 0-shot': 13.753581661891118,
279
- 'Tatabahasa 1-shot': 21.20343839541547,
280
- 'Tatabahasa 3-shots': 22.636103151862464,
281
- },
282
- {
283
- 'T': '⭕',
284
- 'model': '[mesolitica/mallam-1.1b-20k-instructions](https://huggingface.co/mesolitica/mallam-1.1b-20k-instructions)',
285
- 'Tatabahasa 0-shot': 26.923076923076923,
286
- 'Tatabahasa 1-shot': 28.939828080229223,
287
- 'Tatabahasa 3-shots': 21.776504297994272,
288
  },
289
  ]
290
 
291
- data = pd.DataFrame(close_source + open_source)
292
 
293
  demo = gr.Blocks(css=custom_css)
294
  with demo:
 
2
  import pandas as pd
3
  from css_html_js import custom_css
4
 
5
+ TITLE = """<h1 align="center" id="space-title">🇹🇭 Thai Sentence Embedding Leaderboard</h1>"""
6
 
7
  INTRODUCTION_TEXT = """
8
+ 📐 The 🇹🇭 Thai Sentence Embedding Leaderboard aims to track, rank and evaluate open embedding models on Thai sentence embedding tasks. All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
9
  ## Dataset
10
  📈 We evaluate models based on 3 datasets,
11
  1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
 
20
  1. Claude 1.3 and 2.0 Tatabahasa contributed by https://www.linkedin.com/in/fahim-surani
21
  2. Claude 3.0 contributed by https://github.com/theblackcat102, https://huggingface.co/theblackcat102
22
  ## Tagging
23
+ 🟢 Non-LLM ⭕ LLM 📦 API
24
  """
25
 
26
+ results = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  {
28
  'T': '🟢',
29
+ 'model': '[BAAI/bge-m3](https://huggingface.co/BAAI/bge-m3)',
30
+ 'Model Size (Million Parameters)': 570,
31
+ 'Embedding Dimensions': 1024,
32
+ 'Average (8 datasets)': 75.64,
33
+ 'STS Average (1 datasets)': 77.22,
34
+ 'Classification (3 datasets)': 59.95,
35
+ 'PairClassification (1 datasets)': 79.02,
36
+ 'Retrieval (3 datasets)': 91.42,
37
  },
38
  {
39
+ 'T': '📦',
40
+ 'model': 'Cohere-embed-multilingual-v3.0',
41
+ 'Embedding Dimensions': 1024,
42
+ 'Average (8 datasets)': 74.86,
43
+ 'STS Average (1 datasets)': 77.87,
44
+ 'Classification (3 datasets)': 59.96,
45
+ 'PairClassification (1 datasets)': 73.28,
46
+ 'Retrieval (3 datasets)': 91.43,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
  },
48
  ]
49
 
50
+ data = pd.DataFrame(results)
51
 
52
  demo = gr.Blocks(css=custom_css)
53
  with demo: