kevinpro commited on
Commit
556657e
1 Parent(s): 8cc8064

commit message

Browse files
README.md CHANGED
@@ -1,12 +0,0 @@
1
- ---
2
- title: Open Multilingual Reasoning Leaderboard
3
- emoji: 💻
4
- colorFrom: pink
5
- colorTo: green
6
- sdk: gradio
7
- sdk_version: 4.21.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
__pycache__/content.cpython-38.pyc ADDED
Binary file (1.34 kB). View file
 
__pycache__/css.cpython-38.pyc ADDED
Binary file (416 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import glob
4
+ from collections import defaultdict
5
+ import pandas as pd
6
+ import gradio as gr
7
+ from content import *
8
+ from css import *
9
+ import glob
10
+
11
+
12
+
13
# ---------------------------------------------------------------------------
# Leaderboard configuration constants.
#
# The ARC/HellaSwag/MMLU/TruthfulQA identifiers and per-language metadata are
# inherited from the original Open Multilingual LLM leaderboard; the current
# math-reasoning leaderboard only displays the MSVAMP/MGSM/MNumGLUESub columns.
# ---------------------------------------------------------------------------
ARC = "arc"
HELLASWAG = "hellaswag"
MMLU = "mmlu"
TRUTHFULQA = "truthfulqa"
BENCHMARKS = [ARC, HELLASWAG, MMLU, TRUTHFULQA]

# Metric reported for each benchmark in BENCHMARKS (parallel lists).
METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]

# ISO 639-1 codes of the covered languages.
LANGS = 'ar,bn,ca,da,de,es,eu,fr,gu,hi,hr,hu,hy,id,it,kn,ml,mr,ne,nl,pt,ro,ru,sk,sr,sv,ta,te,uk,vi,zh'.split(',')

# Language code -> human-readable English name.
LANG_NAME = {
    'ar': 'Arabic',
    'bn': 'Bengali',
    'ca': 'Catalan',
    'da': 'Danish',
    'de': 'German',
    'es': 'Spanish',
    'eu': 'Basque',
    'fr': 'French',
    'gu': 'Gujarati',
    'hi': 'Hindi',
    'hr': 'Croatian',
    'hu': 'Hungarian',
    'hy': 'Armenian',
    'id': 'Indonesian',
    'it': 'Italian',
    'kn': 'Kannada',
    'ml': 'Malayalam',
    'mr': 'Marathi',
    'ne': 'Nepali',
    'nl': 'Dutch',
    'pt': 'Portuguese',
    'ro': 'Romanian',
    'ru': 'Russian',
    'sk': 'Slovak',
    'sr': 'Serbian',
    'sv': 'Swedish',
    'ta': 'Tamil',
    'te': 'Telugu',
    'uk': 'Ukrainian',
    'vi': 'Vietnamese',
    'zh': 'Chinese'
}

# Column header labels.
MODEL_COL = "Model"
LANG_COL = "Language"
CODE_COL = "Code"
AVERAGE_COL = "Average"
ARC_COL = "ARC (25-shot)"

MGSM_COL = "MGSM"
MSVAMP_COL = "MSVAMP"
MNUM_COL = "MNumGLUESub"
HELLASWAG_COL = "HellaSwag (0-shot)️"
MMLU_COL = "MMLU (25-shot)"
TRUTHFULQA_COL = "TruthfulQA (0-shot)"
NOTES_COL = "Notes"  # For search only

# Columns shown in the Gradio Dataframe and their datatypes (parallel lists).
# NOTE: a previous COLS/TYPES pair for the legacy ARC/HellaSwag/MMLU/TruthfulQA
# layout was assigned and then immediately overwritten; the dead pair has been
# removed.
COLS = [MODEL_COL, MSVAMP_COL, MGSM_COL, MNUM_COL, NOTES_COL]
TYPES = ["str", "number", "number", "number", "str"]
81
def get_leaderboard_df():
    """Build the leaderboard table, sorted by MSVAMP score (descending).

    Returns:
        pd.DataFrame: one row per model with columns ``COLS``
        (Model, MSVAMP, MGSM, MNumGLUESub, Notes). The Notes column is a
        placeholder used only by the search box and is hidden via CSS.
    """
    # Hard-coded published results: [model, MSVAMP, MGSM, MNumGLUESub].
    results = [
        ["GPT-3.5-Turbo", 46.6, 42.2, 49.4],
        ["MAmmoTH", 26.3, 21.3, 24.2],
        ["WizardMath", 32.5, 23.0, 28.7],
        ["MetaMath", 46.2, 37.0, 43.2],
        ["QAlign", 57.2, 49.6, 0],
        ["MathOctopus", 41.2, 39.5, 37.1],
        ["MathOctopus-MAPO-DPO(ours)🔥", 57.4, 41.6, 50.4],
        ["MetaMathOctopus", 53.0, 45.5, 39.2],
        ["MetaMathOctopus-MAPO-DPO(ours) 👑", 64.7, 51.6, 52.9],
        ["MistralMathOctopus", 59.0, 58.0, 56.8],
        ["MistralMathOctopus-MAPO-DPO(ours) 👑", 74.6, 67.3, 70.0]
    ]
    # Fill the search-only Notes column with a placeholder value.
    for row in results:
        row.append("NOTE")
    df = pd.DataFrame.from_records(results, columns=COLS)
    # Rank models by their MSVAMP score, best first.
    return df.sort_values(by=[MSVAMP_COL], ascending=False)
116
+
117
+
118
def search_table(df, query):
    """Return the rows of *df* whose Notes column contains *query*.

    The match is case-insensitive and literal: ``regex=False`` keeps a user
    query such as ``(`` or ``[`` from being parsed as a regular expression,
    which previously raised ``re.error`` in the search callback.

    Args:
        df: leaderboard DataFrame containing a ``NOTES_COL`` column.
        query: free-text search string from the search bar.
    """
    filtered_df = df[df[NOTES_COL].str.contains(query, case=False, regex=False)]
    return filtered_df
121
+
122
+
123
+
124
# ---------------------------------------------------------------------------
# Gradio application.
#
# Renders the leaderboard with a search bar. Each search bar is wired to a
# hidden, never-filtered copy of the full table so that every query filters
# the complete data rather than the previous search result.
#
# NOTE(review): gr.Box and Dataframe(max_rows=...) exist in older Gradio
# releases but were removed/renamed in Gradio 4.x — confirm against the
# pinned sdk version.
# ---------------------------------------------------------------------------
original_df = get_leaderboard_df()

demo = gr.Blocks(css=CUSTOM_CSS)
with demo:
    gr.HTML(TITLE)
    gr.Markdown(INTRO_TEXT, elem_classes="markdown-text")
    #gr.Markdown(HOW_TO, elem_classes="markdown-text")

    with gr.Box():
        search_bar = gr.Textbox(
            placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
        )

        # Visible leaderboard; the last ("Notes") column is hidden by CUSTOM_CSS.
        leaderboard_table = gr.components.Dataframe(
            value=original_df,
            headers=COLS,
            datatype=TYPES,
            max_rows=5,
            elem_id="leaderboard-table",
        )

        # # Dummy leaderboard for handling the case when the user uses backspace key
        hidden_leaderboard_table_for_search = gr.components.Dataframe(
            value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
        )

        # On every keystroke, filter the hidden full table by the query and
        # display the result in the visible table.
        search_bar.change(
            search_table,
            [hidden_leaderboard_table_for_search, search_bar],
            leaderboard_table,
        )

    # NOTE(review): the box below is an exact duplicate of the one above and
    # rebinds search_bar / leaderboard_table / hidden_leaderboard_table_for_search.
    # It renders the same table a second time — confirm the duplication is intended.
    with gr.Box():
        search_bar = gr.Textbox(
            placeholder="Search models and languages...", show_label=False, elem_id="search-bar"
        )

        leaderboard_table = gr.components.Dataframe(
            value=original_df,
            headers=COLS,
            datatype=TYPES,
            max_rows=5,
            elem_id="leaderboard-table",
        )

        # # Dummy leaderboard for handling the case when the user uses backspace key
        hidden_leaderboard_table_for_search = gr.components.Dataframe(
            value=original_df, headers=COLS, datatype=TYPES, max_rows=5, visible=False
        )

        search_bar.change(
            search_table,
            [hidden_leaderboard_table_for_search, search_bar],
            leaderboard_table,
        )

    #gr.Markdown(CREDIT, elem_classes="markdown-text")
    gr.Markdown(CITATION, elem_classes="markdown-text")

demo.launch()
content.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Static page content for the leaderboard Gradio app.

TITLE = '<h1 align="center" id="space-title">Open Multilingual Reasoning Leaderboard</h1>'

# Intro markdown shown under the title.
# Fixes vs. the previous revision: the text said "four benchmarks" while
# listing exactly three, and the literal was closed with `# """`, which leaked
# a stray "# " into the rendered markdown.
INTRO_TEXT = f"""
## About

This leaderboard tracks progress and ranks reasoning performance of large language models (LLMs) developed for different languages,
emphasizing on non-English languages to democratize benefits of LLMs to broader society.
Our current leaderboard provides evaluation data for 10 languages.
Both multilingual and language-specific LLMs are welcome in this leaderboard.
We currently evaluate models over three benchmarks:

- <a href="https://huggingface.co/datasets/Mathoctopus/MSVAMP" target="_blank"> MSVAMP </a>
- <a href="https://huggingface.co/datasets/juletxara/mgsm" target="_blank"> MGSM </a>
- <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MNumGLUESub </a>
"""

# HOW_TO = f"""
# ## How to list your model performance on this leaderboard:

# Run the evaluation of your model using this repo: <a href="https://github.com/nlp-uoregon/mlmm-evaluation" target="_blank">https://github.com/nlp-uoregon/mlmm-evaluation</a>.

# And then, push the evaluation log and make a pull request.
# """

# CREDIT = f"""
# ## Credit

# To make this website, we use the following resources:

# - Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
# - Funding and GPU access (Adobe Research)
# - Evaluation code (EleutherAI's lm_evaluation_harness repo)
# - Leaderboard code (Huggingface4's open_llm_leaderboard repo)

# """

# BibTeX entry rendered at the bottom of the page (double braces escape the
# literal braces inside this f-string).
CITATION = f"""
## Citation

```
@misc{{she2024mapo,
      title={{MAPO: Advancing Multilingual Reasoning through Multilingual Alignment-as-Preference Optimization}},
      author={{Shuaijie She and Wei Zou and Shujian Huang and Wenhao Zhu and Xiang Liu and Xiang Geng and Jiajun Chen}},
      year={{2024}},
      eprint={{2401.06838}},
      archivePrefix={{arXiv}},
      primaryClass={{cs.CL}}
}}
```
"""
css.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Custom CSS injected into the Gradio Blocks app.
# (The Chinese comment in the original has been translated to English; CSS
# comments have no effect on rendering.)
CUSTOM_CSS = """
/* Hides the final column (the search-only "Notes" column) */
table td:last-child,
table th:last-child {
    display: none;
}

/* Constrain the width of the first (model name) column */
table td:first-child,
table th:first-child {
    max-width: 200px;
    overflow: auto;
    white-space: nowrap;
}
"""