update UI
- app.py (+73 -20)
- src/display/about.py (+10 -4)
- src/leaderboard/load_results.py (+2 -2)
app.py
CHANGED
@@ -33,15 +33,24 @@ snapshot_download(
 def restart_space():
     API.restart_space(repo_id="SeaLLMs/SeaExam_leaderboard", token=TOKEN)
 
+all_columns = ['R','type', 'Model','open?', 'avg_sea ⬆️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
+show_columns = ['R','type', 'Model', 'avg_sea ⬆️', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'params(B)']
 # Load the data from the csv file
 csv_path = f'{EVAL_RESULTS_PATH}/SeaExam_results_20240425.csv'
 df_m3exam, df_mmlu, df_avg = load_data(csv_path)
+df_m3exam = df_m3exam.copy()[show_columns]
+df_mmlu = df_mmlu.copy()[show_columns]
+df_avg_init = df_avg.copy()[df_avg['type'] == '🔶 chat'][show_columns]
 
+# data_types = ['number', 'str', 'markdown','str', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']
+# map_columns = {'rank':'R','type':'type', 'Model':'Model','open?':'open?', 'avg_sea':'avg_sea ⬆️', 'en':'en', 'zh':'zh', 'id':'id', 'th':'th', 'vi':'vi', 'avg':'avg', 'params':'params(B)'}
+# map_types = {'rank': 'number', 'type': 'str', 'Model': 'markdown', 'open?': 'str', 'avg_sea': 'number', 'en': 'number', 'zh': 'number', 'id': 'number', 'th': 'number', 'vi': 'number', 'avg': 'number', 'params': 'number'}
 # Searching and filtering
 def update_table(
     hidden_df: pd.DataFrame,
     # columns: list,
-
+    type_query: list,
+    open_query: list,
     # precision_query: str,
     # size_query: list,
     # show_deleted: bool,
@@ -51,9 +60,16 @@ def update_table(
     # filtered_df = filter_queries(query, filtered_df)
     # df = select_columns(filtered_df, columns)
     filtered_df = hidden_df.copy()
-
+
+    filtered_df = filtered_df[filtered_df['type'].isin(type_query)]
+    map_open = {'open': 'Y', 'closed': 'N'}
+    filtered_df = filtered_df[filtered_df['open?'].isin([map_open[o] for o in open_query])]
+    filtered_df = filter_queries(query, filtered_df)
+    # filtered_df = filtered_df[[map_columns[k] for k in columns]]
     # deduplication
-    df = df.drop_duplicates(subset=["Model"])
+    # df = df.drop_duplicates(subset=["Model"])
+    df = filtered_df.drop_duplicates()
+    df = df[show_columns]
     return df
 
 def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
@@ -83,27 +99,44 @@ with demo:
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏅 Overall", elem_id="llm-benchmark-Sum", id=0):
             with gr.Row():
-
-
-
-
-
+                with gr.Column():
+                    with gr.Row():
+                        search_bar = gr.Textbox(
+                            placeholder=" 🔍 Search for your model (separate multiple queries with `;`) and press ENTER...",
+                            show_label=False,
+                            elem_id="search-bar",
+                        )
+                    # with gr.Row():
+                    #     with gr.Column():
+                    #         shown_columns = gr.CheckboxGroup(
+                    #             choices=["rank","type", "Model","open?", "avg_sea", "en", "zh", "id", "th", "vi", "avg", "params"],
+                    #             value=["rank", "type", "Model", "avg_sea", "en", "zh", "id", "th", "vi", "avg", "params"],
+                    #             label="Select model types to show",
+                    #             elem_id="column-select",
+                    #             interactive=True,
+                    #         )
 
             # with gr.Row():
-
-
-
-
-
-
-
-
-
-
-
+                with gr.Column():
+                    type_query = gr.CheckboxGroup(
+                        choices=["🟢 base", "🔶 chat"],
+                        value=["🔶 chat"],
+                        label="model types to show",
+                        elem_id="type-select",
+                        interactive=True,
+                    )
+                with gr.Column():
+                    open_query = gr.CheckboxGroup(
+                        choices=["open", "closed"],
+                        value=["open", "closed"],
+                        label="open-source or closed-source models?",
+                        elem_id="open-select",
+                        interactive=True,
+                    )
 
             leaderboard_table = gr.components.Dataframe(
-                value=
+                value=df_avg_init,
+                # [[map_columns[k] for k in shown_columns.value]],
                 # value=leaderboard_df[
                 #     [c.name for c in fields(AutoEvalColumn) if c.never_hidden]
                 #     + shown_columns.value
@@ -114,6 +147,7 @@ with demo:
                 elem_id="leaderboard-table",
                 interactive=False,
                 datatype=['number', 'str', 'markdown', 'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number'],
+                # datatype=[map_types[k] for k in shown_columns.value],
                 visible=True,
                 # column_widths=["20%", "6%", "8%", "6%", "8%", "8%", "6%", "6%", "6%", "6%", "6%"],
             )
@@ -131,6 +165,8 @@ with demo:
                 # df_avg,
                 hidden_leaderboard_table_for_search,
                 # shown_columns,
+                type_query,
+                open_query,
                 # filter_columns_type,
                 # filter_columns_precision,
                 # filter_columns_size,
@@ -139,6 +175,23 @@ with demo:
             ],
             leaderboard_table,
         )
+        for selector in [type_query, open_query]:
+            selector.change(
+                update_table,
+                [
+                    # df_avg,
+                    hidden_leaderboard_table_for_search,
+                    # shown_columns,
+                    type_query,
+                    open_query,
+                    # filter_columns_type,
+                    # filter_columns_precision,
+                    # filter_columns_size,
+                    # deleted_models_visibility,
+                    search_bar,
+                ],
+                leaderboard_table,
+            )
         with gr.TabItem("M3Exam", elem_id="llm-benchmark-M3Exam", id=1):
             with gr.Row():
                 search_bar = gr.Textbox(
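In sum, the reworked update_table filters the hidden full table by model type, then by open/closed status (mapping the checkbox labels to the 'Y'/'N' codes stored in the 'open?' column), applies the search query, deduplicates, and reorders to show_columns. Because the same function is registered on search_bar.submit and on each selector's change event, toggling either CheckboxGroup or submitting a search recomputes the visible table. A minimal runnable sketch of the two checkbox filters on a toy DataFrame (the rows below are invented for illustration, not real leaderboard data):

import pandas as pd

# Toy stand-in for the hidden leaderboard table (invented rows).
toy = pd.DataFrame({
    'Model': ['model-a', 'model-b', 'model-c'],
    'type': ['🔶 chat', '🟢 base', '🔶 chat'],
    'open?': ['Y', 'N', 'Y'],
})

type_query = ['🔶 chat']   # value of the type_query CheckboxGroup
open_query = ['open']      # value of the open_query CheckboxGroup
map_open = {'open': 'Y', 'closed': 'N'}

# Same two filter steps as in update_table above.
filtered = toy[toy['type'].isin(type_query)]
filtered = filtered[filtered['open?'].isin([map_open[o] for o in open_query])]
print(filtered['Model'].tolist())  # ['model-a', 'model-c']: open chat models only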
src/display/about.py
CHANGED
@@ -23,18 +23,24 @@ SUB_TITLE = """<h2 align="center" id="space-title">What is the best LLM for Sout
 
 # What does your leaderboard evaluate?
 INTRODUCTION_TEXT = """
-This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages.
+This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. Refer to the "📝 About" tab for more information.
+"""
 
-
+# INTRODUCTION_TEXT = """
+# This leaderboard is specifically designed to evaluate large language models (LLMs) for Southeast Asian (SEA) languages. It assesses model performance using human-exam type benchmarks, reflecting the model's world knowledge (e.g., with language or social science subjects) and reasoning abilities (e.g., with mathematics or natural science subjects).
 
-
-
+# For additional details such as datasets, evaluation criteria, and reproducibility, please refer to the "📝 About" tab.
+
+# Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
+# """
 
 # Which evaluations are you running? how can people reproduce what you have?
 LLM_BENCHMARKS_TEXT = f"""
 # About
 Even though large language models (LLMs) have shown impressive performance on various benchmarks for English, their performance on Southeast Asian (SEA) languages is still underexplored. This leaderboard aims to evaluate LLMs on exam-type benchmarks for English, Chinese and SEA languages, focusing on world knowledge and reasoning abilities. The five languages for evaluation are English (en), Chinese (zh), Indonesian (id), Thai (th), and Vietnamese (vi).
 
+Stay tuned for the *SeaBench leaderboard* - focusing on evaluating the model's ability to respond to general human instructions in real-world multi-turn settings.
+
 ## Datasets
 The benchmark data can be found in the [SeaExam dataset](https://huggingface.co/datasets/SeaLLMs/SeaExam). The dataset consists of two tasks:
 - [**M3Exam**](https://arxiv.org/abs/2306.05179): a benchmark sourced from real and official human exam questions for evaluating LLMs in a multilingual, multimodal, and multilevel context. We post-process the data for the 5 languages.
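INTRODUCTION_TEXT and LLM_BENCHMARKS_TEXT are plain markdown strings. A small sketch of how such constants are typically rendered in this kind of leaderboard app, assuming the usual gr.Markdown pattern (the actual call sites in app.py are not part of this diff, so the wiring below is an assumption):

import gradio as gr
from src.display.about import INTRODUCTION_TEXT, LLM_BENCHMARKS_TEXT

with gr.Blocks() as demo:
    # Assumed rendering pattern; the real app.py wiring is not shown in this commit.
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
    with gr.Tabs():
        with gr.TabItem("📝 About"):
            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")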
src/leaderboard/load_results.py
CHANGED
@@ -28,7 +28,7 @@ def make_clickable_model(model_name, link=None):
     if len(model_name.split("/")) == 2:
         link = "https://huggingface.co/" + model_name
         return (
-            f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name}</a>'
+            f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
         )
     return model_name
 
@@ -36,7 +36,7 @@ def load_data(data_path):
     df = pd.read_csv(data_path, skiprows=1, header=0).dropna()
 
     columns = ['Model', 'type', 'open?', 'shot', 'en', 'zh', 'id', 'th', 'vi', 'avg', 'avg_sea']
-    columns_sorted = ['R','type', 'Model', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
+    columns_sorted = ['R','type', 'Model','open?', 'avg_sea', 'en', 'zh', 'id', 'th', 'vi', 'avg']
 
     # Splitting into three separate DataFrames based on the groups M3Exam and MMLU and average
     df_m3exam = df.iloc[:, :11]  # M3Exam columns
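For reference, the changed helper still links to the Hugging Face repo but now shows only the model name, without the org prefix, as the link text. A self-contained copy mirroring the post-change code above, with a hypothetical example id:

def make_clickable_model(model_name, link=None):
    # For "org/model" ids, link to the HF repo but display only the model part.
    if len(model_name.split("/")) == 2:
        link = "https://huggingface.co/" + model_name
        return (
            f'<a target="_blank" style="text-decoration: underline" href="{link}">{model_name.split("/")[-1]}</a>'
        )
    return model_name

print(make_clickable_model("SeaLLMs/SeaLLM-7B-v2"))
# <a target="_blank" style="text-decoration: underline" href="https://huggingface.co/SeaLLMs/SeaLLM-7B-v2">SeaLLM-7B-v2</a>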