huseinzol05 committed on
Commit
a38e13d
•
1 Parent(s): a551fbc

improve scores

Browse files
Files changed (1) hide show
  1. app.py +50 -21
app.py CHANGED
@@ -4,52 +4,81 @@ from css_html_js import custom_css
4
 
5
  demo = gr.Blocks(css=custom_css)
6
 
7
- TITLE = """<h1 align="center" id="space-title">🤗 Malay LLM Leaderboard</h1>"""
8
 
9
  INTRODUCTION_TEXT = """
10
- πŸ“ The πŸ€— Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks.\n
11
  🤗 All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
12
 
13
  ## Dataset
14
 
15
- 📈 We evaluate models based on 4 datasets,
16
 
17
  1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
18
- 2. BM Paper 1, contains 180 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com-bm-kertas-1
19
- 3. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
20
- 4. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
21
  """
22
 
23
  data = [
24
  {
25
  'model': 'gpt-3.5-turbo-0613',
26
- 'BM-PT3 0-shot (% correct)': 36.53846153846153,
27
- 'BM-PT3 1-shot (% correct)': 28.846153846153843,
28
- 'BM-PT3 3-shots (% correct)': 24.528301886792452,
 
 
 
 
 
 
 
 
 
29
  },
30
  {
31
  'model': 'malaysian-llama2-7b-32k',
32
- 'BM-PT3 0-shot (% correct)': 20.37037037037037,
33
- 'BM-PT3 1-shot (% correct)': 16.666666666666664,
34
- 'BM-PT3 3-shots (% correct)': 27.77777777777778,
 
 
 
 
 
 
35
  },
36
  {
37
  'model': 'malaysian-llama2-13b-32k',
38
- 'BM-PT3 0-shot (% correct)': 33.33333333333333,
39
- 'BM-PT3 1-shot (% correct)': 24.074074074074073,
40
- 'BM-PT3 3-shots (% correct)': 25.925925925925924,
 
 
 
 
 
 
41
  },
42
  {
43
  'model': 'malaysian-mistral-7b-4k',
44
- 'BM-PT3 0-shot (% correct)': 20.37037037037037,
45
- 'BM-PT3 1-shot (% correct)': 25.925925925925924,
46
- 'BM-PT3 3-shots (% correct)': 29.629629629629626,
 
 
 
47
  },
48
  {
49
  'model': 'malaysian-mistral-7b-32k',
50
- 'BM-PT3 0-shot (% correct)': 16.666666666666664,
51
- 'BM-PT3 1-shot (% correct)': 22.22222222222222,
52
- 'BM-PT3 3-shots (% correct)': 14.814814814814813,
 
 
 
 
 
 
53
  }
54
  ]
55
 
 
4
 
5
  demo = gr.Blocks(css=custom_css)
6
 
7
+ TITLE = """<h1 align="center" id="space-title">🇲🇾 Malay LLM Leaderboard</h1>"""
8
 
9
  INTRODUCTION_TEXT = """
10
+ πŸ“ The πŸ‡²πŸ‡Ύ Malay LLM Leaderboard aims to track, rank and evaluate open LLMs on Malay tasks.\n
11
  🤗 All notebooks at https://github.com/mesolitica/llm-benchmarks, feel free to submit your own score at https://huggingface.co/spaces/mesolitica/malay-llm-leaderboard/discussions with link to the notebook.
12
 
13
  ## Dataset
14
 
15
+ 📈 We evaluate models based on 3 datasets,
16
 
17
  1. BM-PT3 Paper 1, contains 54 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/BM-pt3
18
+ 2. Tatabahasa, contains 349 questions, https://github.com/mesolitica/malaysian-dataset/tree/master/llm-benchmark/tatabahasabm.tripod.com
19
+ 3. Translated IndoNLI to Malay, tested on `test_expert` dataset, https://huggingface.co/datasets/mesolitica/translated-indonli
 
20
  """
21
 
22
  data = [
23
  {
24
  'model': 'gpt-3.5-turbo-0613',
25
+ 'BM-PT3 0-shot': 36.53846153846153,
26
+ 'BM-PT3 1-shot': 28.846153846153843,
27
+ 'BM-PT3 3-shots': 24.528301886792452,
28
+ 'Tatabahasa 0-shot': 59.530791788856305,
29
+ 'Tatabahasa 1-shot': 60.80691642651297,
30
+ 'Tatabahasa 3-shots': 63.03724928366762,
31
+ },
32
+ {
33
+ 'model': 'gpt-4-1106-preview',
34
+ 'Tatabahasa 0-shot': 75.64469914040114,
35
+ 'Tatabahasa 1-shot': 73.63896848137536,
36
+ 'Tatabahasa 3-shots': 75.64469914040114,
37
  },
38
  {
39
  'model': 'malaysian-llama2-7b-32k',
40
+ 'BM-PT3 0-shot': 20.37037037037037,
41
+ 'BM-PT3 1-shot': 20.37037037037037,
42
+ 'BM-PT3 3-shots': 29.629629629629626,
43
+ },
44
+ {
45
+ 'model': 'malaysian-llama2-7b-32k-instructions',
46
+ 'BM-PT3 0-shot': 35.294117647058826,
47
+ 'BM-PT3 1-shot': 21.153846153846153,
48
+ 'BM-PT3 3-shots': 28.30188679245283,
49
  },
50
  {
51
  'model': 'malaysian-llama2-13b-32k',
52
+ 'BM-PT3 0-shot': 33.33333333333333,
53
+ 'BM-PT3 1-shot': 20.37037037037037,
54
+ 'BM-PT3 3-shots': 31.48148148148148,
55
+ },
56
+ {
57
+ 'model': 'malaysian-llama2-13b-32k-instructions',
58
+ 'BM-PT3 0-shot': 28.57142857142857,
59
+ 'BM-PT3 1-shot': 12.244897959183673,
60
+ 'BM-PT3 3-shots': 17.307692307692307,
61
  },
62
  {
63
  'model': 'malaysian-mistral-7b-4k',
64
+ 'BM-PT3 0-shot': 20.37037037037037,
65
+ 'BM-PT3 1-shot': 22.22222222222222,
66
+ 'BM-PT3 3-shots': 33.33333333333333,
67
+ 'Tatabahasa 0-shot': 21.48997134670487,
68
+ 'Tatabahasa 1-shot': 28.939828080229223,
69
+ 'Tatabahasa 3-shots': 24.641833810888254,
70
  },
71
  {
72
  'model': 'malaysian-mistral-7b-32k',
73
+ 'BM-PT3 0-shot': 16.666666666666664,
74
+ 'BM-PT3 1-shot': 16.666666666666664,
75
+ 'BM-PT3 3-shots': 25.925925925925924,
76
+ },
77
+ {
78
+ 'model': 'malaysian-mistral-7b-32k-instructions',
79
+ 'BM-PT3 0-shot': 21.568627450980394,
80
+ 'BM-PT3 1-shot': 31.25,
81
+ 'BM-PT3 3-shots': 28.000000000000004,
82
  }
83
  ]
84