update UI
- app.py (+73 -20)
- src/display/about.py (+10 -4)
- src/leaderboard/load_results.py (+2 -2)
app.py
CHANGED
@@ -33,15 +33,24 @@ snapshot_download(
 def restart_space():
     API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
 
+all_columns = ['R','type', 'Model','open?', 'avg_sea ⬆️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
+show_columns = ['R','type', 'Model', 'avg_sea ⬆️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
 # Load the data from the csv file
 csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20240425.csv'
 df_m3exam, df_mmlu, df_avg = load_data(csv_path)
+df_m3exam = df_m3exam.copy()[show_columns]
+df_mmlu = df_mmlu.copy()[show_columns]
+df_avg_init = df_avg.copy()[df_avg['type'] == '🔶 chat'][show_columns]
 
+# data_types = ['number', 'str', 'markdown','str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+# map_columns = {'rank':'R','type':'type', 'Model':'Model','open?':'open?', 'avg_sea':'avg_sea ⬆️', 'en':'en', 'zh':'zh', 'id':'id', 'th':'th', 'vi':'vi', 'avg':'avg', 'params':'params(B)'}
+# map_types = {'rank': 'number', 'type': 'str', 'Model': 'markdown', 'open?': 'str', 'avg_sea': 'number', 'en': 'number', 'zh': 'number', 'id': 'number', 'th': 'number', 'vi': 'number', 'avg': 'number', 'params': 'number'}
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
     # columns: list,
-
+    type_query: list,
+    open_query: list,
     # precision_query: str,
     # size_query: list,
     # show_deleted: bool,
@@ -51,9 +60,16 @@ def update_table(
     # filtered_df = filter_queries(query, filtered_df)
     # df = select_columns(filtered_df, columns)
     filtered_df = hidden_df.copy()
-
+
+    filtered_df = filtered_df[filtered_df['type'].isin(type_query)]
+    map_open = {'open': 'Y', 'closed': 'N'}
+    filtered_df = filtered_df[filtered_df['open?'].isin([map_open[o] for o in open_query])]
+    filtered_df = filter_queries(query, filtered_df)
+    # filtered_df = filtered_df[[map_columns[k] for k in columns]]
     # deduplication
-    df = df.drop_duplicates(subset=["Model"])
+    # df = df.drop_duplicates(subset=["Model"])
+    df = filtered_df.drop_duplicates()
+    df = df[show_columns]
     return df
 
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
@@ -83,27 +99,44 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Overall", elem_id="llm-benchmark-Sum", id=0):
             with gr.Row():
-
-
-
-
-
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    # with gr.Row():
+                    #     with gr.Column():
+                    #         shown_columns = gr.CheckboxGroup(
+                    #             choices=["rank","type", "Model","open?", "avg_sea", "en", "zh", "id", "th", "vi", "avg", "params"],
+                    #             value=["rank", "type", "Model", "avg_sea", "en", "zh", "id", "th", "vi", "avg", "params"],
+                    #             label="Select model types to show",
+                    #             elem_id="column-select",
+                    #             interactive=True,
+                    #         )
 
             # with gr.Row():
-
-
-
-
-
-
-
-
-
-
-
+                with gr.Column():
+                    type_query = gr.CheckboxGroup(
+                        choices=["🟢 base", "🔶 chat"],
+                        value=["🔶 chat"],
+                        label="model types to show",
+                        elem_id="type-select",
+                        interactive=True,
+                    )
+                with gr.Column():
+                    open_query = gr.CheckboxGroup(
+                        choices=["open", "closed"],
+                        value=["open", "closed"],
+                        label="open-source or closed-source models?",
+                        elem_id="open-select",
+                        interactive=True,
+                    )
 
             leaderboard_table = gr.components.Dataframe(
-                value=
+                value=df_avg_init,
+                # [[map_columns[k] for k in shown_columns.value]],
                 # value=leaderboard_df[
                 #     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                 #     + shown_columns.value
@@ -114,6 +147,7 @@ with demo:
                 elem_id="leaderboard-table",
                 interactive=False,
                 datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
+                # datatype=[map_types[k] for k in shown_columns.value],
                 visible=True,
                 # column_widths=["20%", "6%", "8%", "6%", "8%", "8%", "6%", "6%", "6%", "6%", "6%"],
             )
@@ -131,6 +165,8 @@ with demo:
                 # df_avg,
                 hidden_leaderboard_table_for_search,
                 # shown_columns,
+                type_query,
+                open_query,
                 # filter_columns_type,
                 # filter_columns_precision,
                 # filter_columns_size,
@@ -139,6 +175,23 @@ with demo:
             ],
             leaderboard_table,
         )
+        for selector in [type_query, open_query]:
+            selector.change(
+                update_table,
+                [
+                    # df_avg,
+                    hidden_leaderboard_table_for_search,
+                    # shown_columns,
+                    type_query,
+                    open_query,
+                    # filter_columns_type,
+                    # filter_columns_precision,
+                    # filter_columns_size,
+                    # deleted_models_visibility,
+                    search_bar,
+                ],
+                leaderboard_table,
+            )
         with gr.TabItem("M3Exam", elem_id="llm-benchmark-M3Exam", id=1):
             with gr.Row():
                 search_bar = gr.Textbox(
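In sum, the reworked update_table filters the hidden full table by model type, then by open/closed status (mapping the checkbox labels to the 'Y'/'N' codes stored in the 'open?' column), applies the search query, deduplicates, and reorders to show_columns. Because the same function is registered on search_bar.submit and on each selector's change event, toggling either CheckboxGroup or submitting a search recomputes the visible table. A minimal runnable sketch of the two checkbox filters on a toy DataFrame (the rows below are invented for illustration, not real leaderboard data):

import pandas as pd

# Toy stand-in for the hidden leaderboard table (invented rows).
toy = pd.DataFrame({
    'Model': ['model-a', 'model-b', 'model-c'],
    'type': ['🔶 chat', '🟢 base', '🔶 chat'],
    'open?': ['Y', 'N', 'Y'],
})

type_query = ['🔶 chat']   # value of the type_query CheckboxGroup
open_query = ['open']      # value of the open_query CheckboxGroup
map_open = {'open': 'Y', 'closed': 'N'}

# Same two filter steps as in update_table above.
filtered = toy[toy['type'].isin(type_query)]
filtered = filtered[filtered['open?'].isin([map_open[o] for o in open_query])]
print(filtered['Model'].tolist())  # ['model-a', 'model-c']: open chat models only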
src/display/about.py
CHANGED
@@ -23,18 +23,24 @@ SUB_TITLE = """<h2 align="center" id="space-title">What is the best LLM for Sout
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages.
+This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. Refer to the "📝 About" tab for more information.
+"""
 
-
+# INTRODUCTION_TEXT = """
+# This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects).
 
-
-
+# For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
+
+# Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
+# """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 # About
 Even though large language models (LLMs) have shown impressive performance on various benchmarks for English, their performance on Southeast Asian (SEA) languages is still underexplored. This leaderboard aims to evaluate LLMs on exam-type benchmarks for English, Chinese and SEA languages, focusing on world knowledge and reasoning abilities. The five languages for evaluation are English (en), Chinese (zh), Indonesian (id), Thai (th), and Vietnamese (vi).
 
+Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
+
 ## Datasets
 The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam). The dataset consists of two tasks:
 - [**M3Exam**](https://arxiv.org/abs/2306.05179): a benchmark sourced from real and official human exam questions for evaluating LLMs in a multilingual, multimodal, and multilevel context. We post-process the data for the 5 languages.
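INTRODUCTION_TEXT and LLM_BENCHMARKS_TEXT are plain markdown strings. A small sketch of how such constants are typically rendered in this kind of leaderboard app, assuming the usual gr.Markdown pattern (the actual call sites in app.py are not part of this diff, so the wiring below is an assumption):

import gradio as gr
from src.display.about import INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT

with gr.Blocks() as demo:
    # Assumed rendering pattern; the real app.py wiring is not shown in this commit.
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs():
        with gr.TabItem("📝 About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")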
src/leaderboard/load_results.py
CHANGED
@@ -28,7 +28,7 @@ def make_clickable_model(model_name, link=None):
     if len(model_name.split("/")) == 2:
         link = "https://huggingface.co/" + model_name
         return (
-            f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
+            f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
         )
     return model_name
 
@@ -36,7 +36,7 @@ def load_data(data_path):
     df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
 
     columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
-    columns_sorted = ['R','type', 'Model', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
+    columns_sorted = ['R','type', 'Model','open?', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
 
     # Splitting into three separate DataFrames based on the groups M3Exam and MMLU and average
     df_m3exam = df.iloc[:, :11]  # M3Exam columns
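For reference, the changed helper still links to the Hugging Face repo but now shows only the model name, without the org prefix, as the link text. A self-contained copy mirroring the post-change code above, with a hypothetical example id:

def make_clickable_model(model_name, link=None):
    # For "org/model" ids, link to the HF repo but display only the model part.
    if len(model_name.split("/")) == 2:
        link = "https://huggingface.co/" + model_name
        return (
            f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
        )
    return model_name

print(make_clickable_model("SeaLLMs/SeaLLM-7B-v2"))
# <a target="_blank" style="text-decoration: underline" href="https://huggingface.co/SeaLLMs/SeaLLM-7B-v2">SeaLLM-7B-v2</a>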