kevinpro commited on
Commit
556657e
1 Parent(s): 8cc8064

commit message

Browse files
README.md CHANGED
@@ -1,12 +0,0 @@
1
- ---
2
- title: Open Multilingual Reasoning Leaderboard
3
- emoji: 💻
4
- colorFrom: pink
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 4.21.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
__pycache__/content.cpython-38.pyc ADDED
Binary file (1.34 kB). View file
 
__pycache__/css.cpython-38.pyc ADDED
Binary file (416 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import glob
4
+ from collections import defaultdict
5
+ import pandas as pd
6
+ import gradio as gr
7
+ from content import *
8
+ from css import *
9
+ import glob
10
+
11
+
12
+
13
# ---------------------------------------------------------------------------
# Leaderboard configuration constants.
#
# The ARC/HellaSwag/MMLU/TruthfulQA identifiers and per-language metadata are
# inherited from the original Open Multilingual LLM leaderboard; the current
# math-reasoning leaderboard only displays the MSVAMP/MGSM/MNumGLUESub columns.
# ---------------------------------------------------------------------------
ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]

# Metric reported for each benchmark in BENCHMARKS (parallel lists).
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]

# ISO 639-1 codes of the covered languages.
LANGS = 'ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh'.split(',')

# Language code -> human-readable English name.
LANG_NAME = {
    'ar': 'Arabic',
    'bn': 'Bengali',
    'ca': 'Catalan',
    'da': 'Danish',
    'de': 'German',
    'es': 'Spanish',
    'eu': 'Basque',
    'fr': 'French',
    'gu': 'Gujarati',
    'hi': 'Hindi',
    'hr': 'Croatian',
    'hu': 'Hungarian',
    'hy': 'Armenian',
    'id': 'Indonesian',
    'it': 'Italian',
    'kn': 'Kannada',
    'ml': 'Malayalam',
    'mr': 'Marathi',
    'ne': 'Nepali',
    'nl': 'Dutch',
    'pt': 'Portuguese',
    'ro': 'Romanian',
    'ru': 'Russian',
    'sk': 'Slovak',
    'sr': 'Serbian',
    'sv': 'Swedish',
    'ta': 'Tamil',
    'te': 'Telugu',
    'uk': 'Ukrainian',
    'vi': 'Vietnamese',
    'zh': 'Chinese'
}

# Column header labels.
MODEL_COL = "Model"
LANG_COL = "Language"
CODE_COL = "Code"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"

MGSM_COL = "MGSM"
MSVAMP_COL = "MSVAMP"
MNUM_COL = "MNumGLUESub"
HELLASWAG_COL = "HellaSwag (0-shot)️"
MMLU_COL = "MMLU (25-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
NOTES_COL = "Notes"  # For search only

# Columns shown in the Gradio Dataframe and their datatypes (parallel lists).
# NOTE: a previous COLS/TYPES pair for the legacy ARC/HellaSwag/MMLU/TruthfulQA
# layout was assigned and then immediately overwritten; the dead pair has been
# removed.
COLS = [MODEL_COL, MSVAMP_COL, MGSM_COL, MNUM_COL, NOTES_COL]
TYPES = ["str", "number", "number", "number", "str"]
81
def get_leaderboard_df():
    """Build the leaderboard table, sorted by MSVAMP score (descending).

    Returns:
        pd.DataFrame: one row per model with columns ``COLS``
        (Model, MSVAMP, MGSM, MNumGLUESub, Notes). The Notes column is a
        placeholder used only by the search box and is hidden via CSS.
    """
    # Hard-coded published results: [model, MSVAMP, MGSM, MNumGLUESub].
    results = [
        ["GPT-3.5-Turbo", 46.6, 42.2, 49.4],
        ["MAmmoTH", 26.3, 21.3, 24.2],
        ["WizardMath", 32.5, 23.0, 28.7],
        ["MetaMath", 46.2, 37.0, 43.2],
        ["QAlign", 57.2, 49.6, 0],
        ["MathOctopus", 41.2, 39.5, 37.1],
        ["MathOctopus-MAPO-DPO(ours)🔥", 57.4, 41.6, 50.4],
        ["MetaMathOctopus", 53.0, 45.5, 39.2],
        ["MetaMathOctopus-MAPO-DPO(ours) 👑", 64.7, 51.6, 52.9],
        ["MistralMathOctopus", 59.0, 58.0, 56.8],
        ["MistralMathOctopus-MAPO-DPO(ours) 👑", 74.6, 67.3, 70.0]
    ]
    # Fill the search-only Notes column with a placeholder value.
    for row in results:
        row.append("NOTE")
    df = pd.DataFrame.from_records(results, columns=COLS)
    # Rank models by their MSVAMP score, best first.
    return df.sort_values(by=[MSVAMP_COL], ascending=False)
116
+
117
+
118
def search_table(df, query):
    """Return the rows of *df* whose Notes column contains *query*.

    The match is case-insensitive and literal: ``regex=False`` keeps a user
    query such as ``(`` or ``[`` from being parsed as a regular expression,
    which previously raised ``re.error`` in the search callback.

    Args:
        df: leaderboard DataFrame containing a ``NOTES_COL`` column.
        query: free-text search string from the search bar.
    """
    filtered_df = df[df[NOTES_COL].str.contains(query, case=False, regex=False)]
    return filtered_df
121
+
122
+
123
+
124
# ---------------------------------------------------------------------------
# Gradio application.
#
# Renders the leaderboard with a search bar. Each search bar is wired to a
# hidden, never-filtered copy of the full table so that every query filters
# the complete data rather than the previous search result.
#
# NOTE(review): gr.Box and Dataframe(max_rows=...) exist in older Gradio
# releases but were removed/renamed in Gradio 4.x — confirm against the
# pinned sdk version.
# ---------------------------------------------------------------------------
original_df = get_leaderboard_df()

demo = gr.Blocks(css=CUSTOM_CSS)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    #gr.Markdown(HOW_TO, elem_classes="markdown-text")

    with gr.Box():
        search_bar = gr.Textbox(
            placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
        )

        # Visible leaderboard; the last ("Notes") column is hidden by CUSTOM_CSS.
        leaderboard_table = gr.components.Dataframe(
            value=original_df,
            headers=COLS,
            datatype=TYPES,
            max_rows=5,
            elem_id="leaderboard-table",
        )

        # # Dummy leaderboard for handling the case when the user uses backspace key
        hidden_leaderboard_table_for_search = gr.components.Dataframe(
            value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
        )

        # On every keystroke, filter the hidden full table by the query and
        # display the result in the visible table.
        search_bar.change(
            search_table,
            [hidden_leaderboard_table_for_search, search_bar],
            leaderboard_table,
        )

    # NOTE(review): the box below is an exact duplicate of the one above and
    # rebinds search_bar / leaderboard_table / hidden_leaderboard_table_for_search.
    # It renders the same table a second time — confirm the duplication is intended.
    with gr.Box():
        search_bar = gr.Textbox(
            placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
        )

        leaderboard_table = gr.components.Dataframe(
            value=original_df,
            headers=COLS,
            datatype=TYPES,
            max_rows=5,
            elem_id="leaderboard-table",
        )

        # # Dummy leaderboard for handling the case when the user uses backspace key
        hidden_leaderboard_table_for_search = gr.components.Dataframe(
            value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
        )

        search_bar.change(
            search_table,
            [hidden_leaderboard_table_for_search, search_bar],
            leaderboard_table,
        )

    #gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

demo.launch()
content.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Static page content for the leaderboard Gradio app.

TITLE = '<h1 align="center" id="space-title">Open Multilingual Reasoning Leaderboard</h1>'

# Intro markdown shown under the title.
# Fixes vs. the previous revision: the text said "four benchmarks" while
# listing exactly three, and the literal was closed with `# """`, which leaked
# a stray "# " into the rendered markdown.
INTRO_TEXT = f"""
## About

This leaderboard tracks progress and ranks reasoning performance of large language models (LLMs) developed for different languages,
emphasizing on non-English languages to democratize benefits of LLMs to broader society.
Our current leaderboard provides evaluation data for 10 languages.
Both multilingual and language-specific LLMs are welcome in this leaderboard.
We currently evaluate models over three benchmarks:

- <a href="https://huggingface.co/datasets/Mathoctopus/MSVAMP" target="_blank"> MSVAMP </a>
- <a href="https://huggingface.co/datasets/juletxara/mgsm" target="_blank"> MGSM </a>
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MNumGLUESub </a>
"""

# HOW_TO = f"""
# ## How to list your model performance on this leaderboard:

# Run the evaluation of your model using this repo: <a href="https://github.com/nlp-uoregon/mlmm-evaluation" target="_blank">https://github.com/nlp-uoregon/mlmm-evaluation</a>.

# And then, push the evaluation log and make a pull request.
# """

# CREDIT = f"""
# ## Credit

# To make this website, we use the following resources:

# - Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
# - Funding and GPU access (Adobe Research)
# - Evaluation code (EleutherAI's lm_evaluation_harness repo)
# - Leaderboard code (Huggingface4's open_llm_leaderboard repo)

# """

# BibTeX entry rendered at the bottom of the page (double braces escape the
# literal braces inside this f-string).
CITATION = f"""
## Citation

```
@misc{{she2024mapo,
      title={{MAPO: Advancing Multilingual Reasoning through Multilingual Alignment-as-Preference Optimization}},
      author={{Shuaijie She and Wei Zou and Shujian Huang and Wenhao Zhu and Xiang Liu and Xiang Geng and Jiajun Chen}},
      year={{2024}},
      eprint={{2401.06838}},
      archivePrefix={{arXiv}},
      primaryClass={{cs.CL}}
}}
```
"""
css.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Custom CSS injected into the Gradio Blocks app.
# (The Chinese comment in the original has been translated to English; CSS
# comments have no effect on rendering.)
CUSTOM_CSS = """
/* Hides the final column (the search-only "Notes" column) */
table td:last-child,
table th:last-child {
    display: none;
}

/* Constrain the width of the first (model name) column */
table td:first-child,
table th:first-child {
    max-width: 200px;
    overflow: auto;
    white-space: nowrap;
}
"""