lukecq committed on
Commit d31b48d • 1 Parent(s): 0e06db3
Files changed (3)
  1. app.py +73 -20
  2. src/display/about.py +10 -4
  3. src/leaderboard/load_results.py +2 -2
app.py CHANGED
@@ -33,15 +33,24 @@ snapshot_download(
 def restart_space():
     API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
 
+all_columns = ['R','type', 'Model','open?', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
+show_columns = ['R','type', 'Model', 'avg_sea ⬇️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
 # Load the data from the csv file
 csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20240425.csv'
 df_m3exam, df_mmlu, df_avg = load_data(csv_path)
+df_m3exam = df_m3exam.copy()[show_columns]
+df_mmlu = df_mmlu.copy()[show_columns]
+df_avg_init = df_avg.copy()[df_avg['type'] == '🔶 chat'][show_columns]
 
+# data_types = ['number', 'str', 'markdown','str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+# map_columns = {'rank':'R','type':'type', 'Model':'Model','open?':'open?', 'avg_sea':'avg_sea ⬇️', 'en':'en', 'zh':'zh', 'id':'id', 'th':'th', 'vi':'vi', 'avg':'avg', 'params':'params(B)'}
+# map_types = {'rank': 'number', 'type': 'str', 'Model': 'markdown', 'open?': 'str', 'avg_sea': 'number', 'en': 'number', 'zh': 'number', 'id': 'number', 'th': 'number', 'vi': 'number', 'avg': 'number', 'params': 'number'}
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
     # columns: list,
-    # type_query: list,
+    type_query: list,
+    open_query: list,
     # precision_query: str,
     # size_query: list,
     # show_deleted: bool,
@@ -51,9 +60,16 @@ def update_table(
     # filtered_df = filter_queries(query, filtered_df)
     # df = select_columns(filtered_df, columns)
     filtered_df = hidden_df.copy()
-    df = filter_queries(query, filtered_df)
+
+    filtered_df = filtered_df[filtered_df['type'].isin(type_query)]
+    map_open = {'open': 'Y', 'closed': 'N'}
+    filtered_df = filtered_df[filtered_df['open?'].isin([map_open[o] for o in open_query])]
+    filtered_df = filter_queries(query, filtered_df)
+    # filtered_df = filtered_df[[map_columns[k] for k in columns]]
     # deduplication
-    df = df.drop_duplicates(subset=["Model"])
+    # df = df.drop_duplicates(subset=["Model"])
+    df = filtered_df.drop_duplicates()
+    df = df[show_columns]
     return df
 
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
@@ -83,27 +99,44 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Overall", elem_id="llm-benchmark-Sum", id=0):
             with gr.Row():
-                search_bar = gr.Textbox(
-                    placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
-                    show_label=False,
-                    elem_id="search-bar",
-                )
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    # with gr.Row():
+                    #     with gr.Column():
+                    #         shown_columns = gr.CheckboxGroup(
+                    #             choices=["rank","type", "Model","open?", "avg_sea", "en", "zh", "id", "th", "vi", "avg", "params"],
+                    #             value=["rank", "type", "Model", "avg_sea", "en", "zh", "id", "th", "vi", "avg", "params"],
+                    #             label="Select model types to show",
+                    #             elem_id="column-select",
+                    #             interactive=True,
+                    #         )
 
                 # with gr.Row():
-                #     shown_columns = gr.CheckboxGroup(
-                #         choices=["🟢 base", "🔶 chat"
-                #             ],
-                #         value=[
-                #             "base",
-                #             "chat",
-                #         ],
-                #         label="Select model types to show",
-                #         elem_id="column-select",
-                #         interactive=True,
-                #     )
+                with gr.Column():
+                    type_query = gr.CheckboxGroup(
+                        choices=["🟢 base", "🔶 chat"],
+                        value=["🔶 chat" ],
+                        label="model types to show",
+                        elem_id="type-select",
+                        interactive=True,
+                    )
+                with gr.Column():
+                    open_query = gr.CheckboxGroup(
+                        choices=["open", "closed"],
+                        value=["open", "closed"],
+                        label="open-source or closed-source models?",
+                        elem_id="open-select",
+                        interactive=True,
+                    )
 
             leaderboard_table = gr.components.Dataframe(
-                value=df_avg,
+                value=df_avg_init,
+                # [[map_columns[k] for k in shown_columns.value]],
                 # value=leaderboard_df[
                 #     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                 #     + shown_columns.value
@@ -114,6 +147,7 @@ with demo:
                 elem_id="leaderboard-table",
                 interactive=False,
                 datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
+                # datatype=[map_types[k] for k in shown_columns.value],
                 visible=True,
                 # column_widths=["20%", "6%", "8%", "6%", "8%", "8%", "6%", "6%", "6%", "6%", "6%"],
             )
@@ -131,6 +165,8 @@ with demo:
                 # df_avg,
                 hidden_leaderboard_table_for_search,
                 # shown_columns,
+                type_query,
+                open_query,
                 # filter_columns_type,
                 # filter_columns_precision,
                 # filter_columns_size,
@@ -139,6 +175,23 @@ with demo:
                 ],
                 leaderboard_table,
             )
+            for selector in [type_query, open_query]:
+                selector.change(
+                    update_table,
+                    [
+                        # df_avg,
+                        hidden_leaderboard_table_for_search,
+                        # shown_columns,
+                        type_query,
+                        open_query,
+                        # filter_columns_type,
+                        # filter_columns_precision,
+                        # filter_columns_size,
+                        # deleted_models_visibility,
+                        search_bar,
+                    ],
+                    leaderboard_table,
+                )
         with gr.TabItem("M3Exam", elem_id="llm-benchmark-M3Exam", id=1):
             with gr.Row():
                 search_bar = gr.Textbox(
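For reference, the core of this change is the new checkbox-driven filtering in `update_table`. The sketch below isolates that logic against a toy DataFrame; the helper name `filter_by_selectors` and the sample rows are illustrative only, not part of the repo (the real app runs on the loaded SeaExam results and additionally applies `filter_queries` for the search box).

```python
# Minimal sketch of the new checkbox filtering in update_table (toy data only).
import pandas as pd

def filter_by_selectors(df: pd.DataFrame, type_query: list, open_query: list) -> pd.DataFrame:
    # Keep only rows whose model type is among the checked types ("🟢 base" / "🔶 chat").
    out = df[df['type'].isin(type_query)]
    # The "open?" column stores Y/N, while the checkboxes show "open"/"closed",
    # so the selections are mapped before filtering.
    map_open = {'open': 'Y', 'closed': 'N'}
    out = out[out['open?'].isin([map_open[o] for o in open_query])]
    return out.drop_duplicates()

if __name__ == "__main__":
    toy = pd.DataFrame({
        'Model': ['model-a', 'model-b', 'model-c'],   # hypothetical rows
        'type': ['🔶 chat', '🟢 base', '🔶 chat'],
        'open?': ['Y', 'Y', 'N'],
    })
    # Default UI state: only chat models, both open- and closed-source shown.
    print(filter_by_selectors(toy, ['🔶 chat'], ['open', 'closed']))
```

Mapping the "open"/"closed" checkbox labels onto the Y/N values stored in the `open?` column keeps the UI wording readable without changing the CSV schema.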
src/display/about.py CHANGED
@@ -23,18 +23,24 @@ SUB_TITLE = """<h2 align="center" id="space-title">What is the best LLM for Sout
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects).
+This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. Refer to the "📝 About" tab for more information.
+"""
 
-For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
+# INTRODUCTION_TEXT = """
+# This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects).
 
-Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
-"""
+# For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
+
+# Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
+# """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 # About
 Even though large language models (LLMs) have shown impressive performance on various benchmarks for English, their performance on Southeast Asian (SEA) languages is still underexplored. This leaderboard aims to evaluate LLMs on exam-type benchmarks for English, Chinese and SEA languages, focusing on world knowledge and reasoning abilities. The five languages for evaluation are English (en), Chinese (zh), Indonesian (id), Thai (th), and Vietnamese (vi).
 
+Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
+
 ## Datasets
 The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam). The dataset consists of two tasks:
 - [**M3Exam**](https://arxiv.org/abs/2306.05179): a benchmark sourced from real and official human exam questions for evaluating LLMs in a multilingual, multimodal, and multilevel context. We post-process the data for the 5 languages.
src/leaderboard/load_results.py CHANGED
@@ -28,7 +28,7 @@ def make_clickable_model(model_name, link=None):
     if len(model_name.split("/")) == 2:
         link = "https://huggingface.co/" + model_name
         return (
-            f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
+            f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
         )
     return model_name
 
@@ -36,7 +36,7 @@ def load_data(data_path):
     df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
 
     columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
-    columns_sorted = ['R','type', 'Model', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
+    columns_sorted = ['R','type', 'Model','open?', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
 
     # Splitting into three separate DataFrames based on the groups M3Exam and MMLU and average
     df_m3exam = df.iloc[:, :11] # M3Exam columns
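The `make_clickable_model` tweak changes only what the rendered link displays: the full `org/model` repo id is still used to build the URL, but the anchor text now shows just the model name after the slash. A stand-alone sketch of that behavior (reimplemented here for illustration; the repo id in the example is only a sample input):

```python
# Illustrative reimplementation of the updated make_clickable_model.
def make_clickable_model(model_name, link=None):
    # A "org/model" id gets linked to its Hugging Face page,
    # with only the model name (the part after "/") shown as link text.
    if len(model_name.split("/")) == 2:
        link = "https://huggingface.co/" + model_name
        return (
            f'<a target="_blank" style="text-decoration: underline" '
            f'href="{link}">{model_name.split("/")[-1]}</a>'
        )
    # Anything else (e.g. closed-source model names) is returned unchanged.
    return model_name

print(make_clickable_model("SeaLLMs/SeaLLM-7B-v2"))
# anchor href points at https://huggingface.co/SeaLLMs/SeaLLM-7B-v2, text reads "SeaLLM-7B-v2"
```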