brainz commited on
Commit
d40c120
·
1 Parent(s): 2f54fcf

Update space

Browse files
.idea/.gitignore ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Default ignored files
2
+ /shelf/
3
+ /workspace.xml
4
+ # Editor-based HTTP Client requests
5
+ /httpRequests/
6
+ # Datasource local storage ignored files
7
+ /dataSources/
8
+ /dataSources.local.xml
.idea/Multilingual-MMLU-Benchmark-Leaderboard.iml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$" />
5
+ <orderEntry type="inheritedJdk" />
6
+ <orderEntry type="sourceFolder" forTests="false" />
7
+ </component>
8
+ <component name="PyDocumentationSettings">
9
+ <option name="format" value="PLAIN" />
10
+ <option name="myDocStringFormat" value="Plain" />
11
+ </component>
12
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/Multilingual-MMLU-Benchmark-Leaderboard.iml" filepath="$PROJECT_DIR$/.idea/Multilingual-MMLU-Benchmark-Leaderboard.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="" vcs="Git" />
5
+ </component>
6
+ </project>
README.md CHANGED
@@ -1,45 +1,100 @@
1
- ---
2
- title: Multilingual MMLU Benchmark Leaderboard
3
- emoji: 🥇
4
- colorFrom: green
5
- colorTo: indigo
6
- sdk: gradio
7
- app_file: app.py
8
- pinned: true
9
- license: apache-2.0
10
- short_description: Multilingual MMLU Benchmark Leaderboard
11
- ---
12
-
13
- # Start the configuration
14
-
15
- Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
16
-
17
- Results files should have the following format and be stored as json files:
18
- ```json
19
- {
20
- "config": {
21
- "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
22
- "model_name": "path of the model on the hub: org/model",
23
- "model_sha": "revision on the hub",
24
- },
25
- "results": {
26
- "task_name": {
27
- "metric_name": score,
28
- },
29
- "task_name2": {
30
- "metric_name": score,
31
- }
32
- }
33
- }
34
- ```
35
-
36
- Request files are created automatically by this tool.
37
-
38
- If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
39
-
40
- # Code logic for more complex edits
41
-
42
- You'll find
43
- - the main table' columns names and properties in `src/display/utils.py`
44
- - the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
45
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Multilingual MMLU Benchmark Leaderboard
2
+
3
+ Welcome to the **Multilingual MMLU Benchmark Leaderboard**! This leaderboard is designed to assess the performance of both open-source and closed-source language models on the **Multilingual MMLU (Massive Multitask Language Understanding)** benchmark. The benchmark evaluates the memorization, reasoning, and linguistic capabilities of models across a wide range of languages, making it a crucial tool for comparing multilingual AI performance.
4
+
5
+ ## Overview
6
+
7
+ The **Multilingual MMLU Benchmark** is a comprehensive evaluation platform for AI models, assessing their general knowledge and ability to handle diverse tasks across **57 distinct domains**. These tasks range from elementary-level knowledge to more advanced subjects such as law, physics, history, and computer science. With this leaderboard, we aim to provide an accessible and reliable way for developers, researchers, and organizations to compare language models' multilingual understanding and reasoning abilities.
8
+
9
+ ## Evaluation Scope
10
+
11
+ ### Key Features:
12
+ - **Multilingual Coverage**: The leaderboard evaluates language models on **14 different languages**, including widely spoken languages such as English, Spanish, Arabic, and Chinese, as well as languages with fewer resources like Swahili and Yoruba.
13
+ - **Diverse Domains**: The benchmark includes tasks across 57 domains, ensuring that models are tested on a wide range of topics, from elementary knowledge to complex professional fields.
14
+ - **Comprehensive QA Tasks**: The evaluation focuses on **Question Answering (QA)** tasks, where models answer questions based on general knowledge in various domains. This helps assess both the depth and breadth of the model's knowledge.
15
+
16
+ ### Languages Covered:
17
+ The leaderboard includes models evaluated on the following languages:
18
+
19
+ - **AR_XY**: Arabic
20
+ - **BN_BD**: Bengali
21
+ - **DE_DE**: German
22
+ - **ES_LA**: Spanish
23
+ - **FR_FR**: French
24
+ - **HI_IN**: Hindi
25
+ - **ID_ID**: Indonesian
26
+ - **IT_IT**: Italian
27
+ - **JA_JP**: Japanese
28
+ - **KO_KR**: Korean
29
+ - **PT_BR**: Brazilian Portuguese
30
+ - **SW_KE**: Swahili
31
+ - **YO_NG**: Yoruba
32
+ - **ZH_CN**: Simplified Chinese
33
+
34
+ ## How It Works
35
+
36
+ ### Submitting Models
37
+ To evaluate your model on the Multilingual MMLU Benchmark, you can submit it through the **"Submit here"** tab on the leaderboard. The evaluation process will run your model through a series of tests across the 57 domains in the 14 supported languages. Results will be provided on the leaderboard, with detailed scores for each language and domain.
38
+
39
+ ### Evaluation Process
40
+ We use the **OpenCompass framework** to automate the evaluation process. This framework enables efficient execution of multiple tests across different languages and domains, ensuring scalability and reproducibility. The following is the evaluation setup:
41
+
42
+ - **Evaluation Method**: All tasks are evaluated using a 5-shot setting.
43
+ - **Normalization**: Results are normalized using the following formula:
44
+ ```plaintext
45
+ normalized_value = (raw_value - random_baseline) / (max_value - random_baseline)
46
+ ```
47
+ ## Evaluation Details
48
+
49
+ ### For Generative Tasks:
50
+ - **Random Baseline**: `random_baseline = 0`
51
+
52
+ ### For Multiple-Choice QA Tasks:
53
+ - **Random Baseline**: `random_baseline = 1/n`, where `n` is the number of choices.
54
+
55
+ ### Aggregated Results:
56
+ Scores for each language are averaged across all tasks to provide a comprehensive model evaluation.
57
+
58
+ ## Results
59
+
60
+ - **Numerical Results**: You can access the detailed evaluation results in the [results dataset](#).
61
+ - **Community Queries**: Track ongoing tasks and request status in the [requests dataset](#).
62
+
63
+ ## Reproducibility
64
+
65
+ To ensure reproducibility, we provide a fork of the **lm_eval** framework, which allows you to recreate the evaluation setup and results on your own models. While not all contributions are integrated into the main repository, our fork contains all the necessary updates for evaluation.
66
+
67
+ For detailed setup instructions and to reproduce results on your local machine, please refer to our [lm_eval fork](#).
68
+
69
+ ## Acknowledgements
70
+
71
+ This leaderboard was developed as part of the **#ProjectName**, led by **[OrganizationName]**, and has benefited from the support of high-quality evaluation datasets donated by the following institutions:
72
+
73
+ - [Institution 1]
74
+ - [Institution 2]
75
+ - [Institution 3]
76
+ - [Institution 4]
77
+ - [Institution 5]
78
+ - [Institution 6]
79
+ - [Institution 7]
80
+
81
+ We also thank **[Institution1]**, **[Institution2]**, and **[Institution3]** for sponsoring inference GPUs, which were crucial for running the large-scale evaluations.
82
+
83
+ ### Special Thanks to the Contributors:
84
+ - **Task Implementation**: [Name1], [Name2], [Name3]
85
+ - **Leaderboard Implementation**: [Name4], [Name5]
86
+ - **Model Evaluation**: [Name6], [Name7]
87
+ - **Communications**: [Name8], [Name9]
88
+ - **Organization & Collaboration**: [Name10], [Name11], [Name12]
89
+
90
+ For more information on the datasets, please refer to the **Dataset Cards** available in the "Tasks" tab and the **Citations** section below.
91
+
92
+ ## Collaborate
93
+
94
+ We are always looking for collaborators to expand the range of evaluated models and datasets. If you would like us to include your evaluation dataset or contribute in any other way, please feel free to get in touch!
95
+
96
+ Your feedback, suggestions, and contributions are more than welcome! Visit our **[Community Page]** to share your thoughts, or feel free to open a pull request (PR).
97
+
98
+ Thank you for your interest in the **Multilingual MMLU Benchmark Leaderboard**. Let’s work together to advance multilingual AI capabilities!
99
+
100
+ For more details on dataset authors and dataset cards, please refer to the "Tasks" tab.
app.py CHANGED
@@ -8,8 +8,12 @@ from src.about import (
8
  CITATION_BUTTON_LABEL,
9
  CITATION_BUTTON_TEXT,
10
  EVALUATION_QUEUE_TEXT,
 
11
  INTRODUCTION_TEXT,
 
12
  LLM_BENCHMARKS_TEXT,
 
 
13
  TITLE,
14
  )
15
  from src.display.css_html_js import custom_css
@@ -32,7 +36,7 @@ from src.submission.submit import add_new_eval
32
  def restart_space():
33
  API.restart_space(repo_id=REPO_ID)
34
 
35
- ### Space initialisation
36
  try:
37
  print(EVAL_REQUESTS_PATH)
38
  snapshot_download(
@@ -93,18 +97,25 @@ demo = gr.Blocks(css=custom_css)
93
  with demo:
94
  gr.HTML(TITLE)
95
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
96
 
97
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
98
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
99
  leaderboard = init_leaderboard(LEADERBOARD_DF)
100
 
101
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
102
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
 
 
103
 
104
  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
105
  with gr.Column():
106
  with gr.Row():
107
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
 
 
108
 
109
  with gr.Column():
110
  with gr.Accordion(
@@ -197,6 +208,15 @@ with demo:
197
  elem_id="citation-button",
198
  show_copy_button=True,
199
  )
 
 
 
 
 
 
 
 
 
200
 
201
  scheduler = BackgroundScheduler()
202
  scheduler.add_job(restart_space, "interval", seconds=1800)
 
8
  CITATION_BUTTON_LABEL,
9
  CITATION_BUTTON_TEXT,
10
  EVALUATION_QUEUE_TEXT,
11
+ EVALUATION_QUEUE_TEXT_ZH,
12
  INTRODUCTION_TEXT,
13
+ INTRODUCTION_TEXT_ZH,
14
  LLM_BENCHMARKS_TEXT,
15
+ LLM_BENCHMARKS_TEXT_ZH,
16
+ LOGOS,
17
  TITLE,
18
  )
19
  from src.display.css_html_js import custom_css
 
36
  def restart_space():
37
  API.restart_space(repo_id=REPO_ID)
38
 
39
+ ## Space initialisation
40
  try:
41
  print(EVAL_REQUESTS_PATH)
42
  snapshot_download(
 
97
  with demo:
98
  gr.HTML(TITLE)
99
  gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
100
+ gr.Markdown(INTRODUCTION_TEXT_ZH, elem_classes="markdown-text")
101
 
102
  with gr.Tabs(elem_classes="tab-buttons") as tabs:
103
  with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
104
  leaderboard = init_leaderboard(LEADERBOARD_DF)
105
 
106
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
107
+ with gr.TabItem("EN", elem_id="llm-benchmark-tab-table", id=1):
108
+ gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
109
+ with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
110
+ gr.Markdown(LLM_BENCHMARKS_TEXT_ZH, elem_classes="markdown-text")
111
 
112
  with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
113
  with gr.Column():
114
  with gr.Row():
115
+ with gr.TabItem("EN", elem_id="llm-benchmark-tab-table", id=1):
116
+ gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
117
+ with gr.TabItem("ZH", elem_id="llm-benchmark-tab-table", id=2):
118
+ gr.Markdown(EVALUATION_QUEUE_TEXT_ZH, elem_classes="markdown-text")
119
 
120
  with gr.Column():
121
  with gr.Accordion(
 
208
  elem_id="citation-button",
209
  show_copy_button=True,
210
  )
211
+ with gr.Row():
212
+ for logo_path in LOGOS:
213
+ gr.Image(
214
+ value=logo_path,
215
+ show_label=False,
216
+ show_download_button=False,
217
+ show_share_button=False,
218
+ show_fullscreen_button=False,
219
+ )
220
 
221
  scheduler = BackgroundScheduler()
222
  scheduler.add_job(restart_space, "interval", seconds=1800)
logo/CAIS.png ADDED
logo/HuggingFace.png ADDED
logo/logo_qwen.jpg ADDED
logo/logo_qwen.png ADDED
logo/openai-logo.png ADDED
src/about.py CHANGED
@@ -12,31 +12,255 @@ class Task:
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
- task0 = Task("anli_r1", "acc", "ANLI")
16
- task1 = Task("logiqa", "acc_norm", "LogiQA")
17
-
18
- NUM_FEWSHOT = 0 # Change with your few shot
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  # ---------------------------------------------------
20
 
21
 
22
 
23
  # Your leaderboard name
24
- TITLE = """<h1 align="center" id="space-title">Demo leaderboard</h1>"""
25
 
26
  # What does your leaderboard evaluate?
27
  INTRODUCTION_TEXT = """
28
- Intro text
 
 
 
29
  """
30
-
31
  # Which evaluations are you running? how can people reproduce what you have?
32
- LLM_BENCHMARKS_TEXT = f"""
33
- ## How it works
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
- ## Reproducibility
36
- To reproduce our results, here is the commands you can run:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
  """
39
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  EVALUATION_QUEUE_TEXT = """
41
  ## Some good practices before submitting a model
42
 
@@ -70,3 +294,38 @@ If everything is done, check you can launch the EleutherAIHarness on your model
70
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
71
  CITATION_BUTTON_TEXT = r"""
72
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  # ---------------------------------------------------
13
  class Tasks(Enum):
14
  # task_key in the json file, metric_key in the json file, name to display in the leaderboard
15
+ task0 = Task("mmmlu", "acc", "MMMLU")
16
+ task1 = Task("mmlu", "acc", "MMLU")
17
+ task2 = Task("cmmlu", "acc", "CMMLU")
18
+ task3 = Task("mmmlu_ar", "acc", "MMMLU_AR")
19
+ task4 = Task("mmmlu_bn", "acc", "MMMLU_BN")
20
+ task5 = Task("mmmlu_de", "acc", "MMMLU_DE")
21
+ task6 = Task("mmmlu_es", "acc", "MMMLU_ES")
22
+ task7 = Task("mmmlu_fr", "acc", "MMMLU_FR")
23
+ task8 = Task("mmmlu_hi", "acc", "MMMLU_HI")
24
+ task9 = Task("mmmlu_id", "acc", "MMMLU_ID")
25
+ task10 = Task("mmmlu_it", "acc", "MMMLU_IT")
26
+ task11 = Task("mmmlu_ja", "acc", "MMMLU_JA")
27
+ task12 = Task("mmmlu_ko", "acc", "MMMLU_KO")
28
+ task13 = Task("mmmlu_pt", "acc", "MMMLU_PT")
29
+ task14 = Task("mmmlu_sw", "acc", "MMMLU_SW")
30
+ task15 = Task("mmmlu_yo", "acc", "MMMLU_YO")
31
+ task16 = Task("mmmlu_zh", "acc", "MMMLU_ZH")
32
+ NUM_FEWSHOT = 5 # Change with your few shot
33
  # ---------------------------------------------------
34
 
35
 
36
 
37
  # Your leaderboard name
38
+ TITLE = """<h1 align="center" id="space-title">Multilingual MMLU Benchmark Leaderboard</h1>"""
39
 
40
  # What does your leaderboard evaluate?
41
  INTRODUCTION_TEXT = """
42
+ **Multilingual MMLU Benchmark Leaderboard:** This leaderboard is dedicated to evaluating the performance of both open-source and closed-source language models on the Multilingual MMLU benchmark. It assesses their memorization, reasoning, and linguistic capabilities across a wide range of languages. The leaderboard consolidates multiple MMLU datasets, originally created or manually translated into various languages, to provide a comprehensive evaluation of multilingual understanding in LLMs.
43
+ """
44
+ INTRODUCTION_TEXT_ZH = """
45
+ **多语言 MMLU 基准榜单:** 这是一个开放的评测榜单,旨在评估开源和闭源语言模型在多语言 MMLU 基准测试中的表现,涵盖记忆、推理和语言能力。该榜单整合了多个 MMLU 数据集,这些数据集最初为多种语言创建或手动翻译,旨在全面评估大规模语言模型在多语言理解上的能力。
46
  """
 
47
  # Which evaluations are you running? how can people reproduce what you have?
48
+ # TODO: Update number of benchmarks
49
+ LLM_BENCHMARKS_TEXT = """
50
+ ## 💡 About "Multilingual MMLU Benchmark Leaderboard"
51
+
52
+ - Press release: [TBD - XXX](#), [TBD - XXX](#), [TBD - XXX](#), [TBD - XXX](#)
53
+ - YouTube: [TBD - XXX](#)
54
+
55
+ ### Overview
56
+ The **Multilingual Massive Multitask Language Understanding (MMMLU)** benchmark is a comprehensive evaluation platform designed to assess the general knowledge capabilities of AI models across a wide range of domains. It includes a series of **Question Answering (QA)** tasks across **57 distinct domains**, ranging from elementary-level knowledge to advanced professional subjects such as law, physics, history, and computer science.
57
+
58
+ ### Translation Effort
59
+ For this evaluation, we used the **OpenAI MMMLU dataset**, which has been extensively curated and tested for a multilingual understanding of AI models. The dataset includes 14 different languages and is specifically designed to assess how well AI models can handle a wide range of general knowledge tasks across 57 domains.
60
+
61
+ The translation of the test set was performed by OpenAI, which ensures a high level of accuracy and reliability for evaluating multilingual models. By leveraging this pre-existing, professionally curated dataset, we can focus on model performance across multiple languages without needing to produce additional translations ourselves.
62
+
63
+ ### Commitment to Multilingual AI
64
+ By focusing on human-powered translations and publishing both the translated test sets and evaluation code, we aim to promote the development of AI models that can handle multilingual tasks with greater accuracy. This reflects our commitment to improving AI’s performance in underrepresented languages and making technology more inclusive and effective globally.
65
+
66
+ ### Locales Covered
67
+ The MMMLU benchmark includes a test set translated into the following locales:
68
+ - **AR_XY**: Arabic
69
+ - **BN_BD**: Bengali
70
+ - **DE_DE**: German
71
+ - **ES_LA**: Spanish
72
+ - **FR_FR**: French
73
+ - **HI_IN**: Hindi
74
+ - **ID_ID**: Indonesian
75
+ - **IT_IT**: Italian
76
+ - **JA_JP**: Japanese
77
+ - **KO_KR**: Korean
78
+ - **PT_BR**: Brazilian Portuguese
79
+ - **SW_KE**: Swahili
80
+ - **YO_NG**: Yoruba
81
+ - **ZH_CN**: Simplified Chinese
82
+
83
+ ### Purpose
84
+ The MMMLU Leaderboard aims to provide a unified benchmark for comparing AI model performance across these multiple languages and diverse domains. With the inclusion of the **QA task** across **57 domains**, it evaluates how well models perform in answering general knowledge questions in multiple languages, ensuring a high standard of multilingual understanding and reasoning.
85
+
86
+ ### Goals
87
+ Our primary goal is to provide a reliable comparison for AI models across different languages and domains, helping developers and researchers evaluate and improve their models’ multilingual capabilities. By emphasizing high-quality translations and including a broad range of topics, we strive to make AI models more robust and useful across diverse communities worldwide.
88
+
89
+ ### 🤗 How it works
90
+
91
+ Submit a model for automated evaluation on our clusters on the "Submit here" tab!
92
+
93
+ ### 📈 Tasks
94
+
95
+ We evaluate models on a variety of key benchmarks, with a focus on **Multilingual Massive Multitask Language Understanding (MMLU)** and its variants, including MMLU, C-MMLU, ArabicMMLU, KoreanMMLU, MalayMMLU, and others. These benchmarks assess general knowledge across a wide range of topics from 57 categories, such as law, physics, history, and computer science.
96
+
97
+ The evaluation is performed using the [OpenCompass framework](https://github.com/open-compass/opencompass), a unified platform for evaluating language models across multiple tasks. OpenCompass allows us to execute these evaluations efficiently and at scale, covering multiple languages and benchmarks.
98
+
99
+ For detailed information on the tasks, please refer to the "Tasks" tab in the OpenCompass framework.
100
+
101
+ Notes:
102
+ - The evaluations are all 5-shot.
103
+ - The results are normalized with the following formula: `normalized_value = (raw_value - random_baseline) / (max_value - random_baseline)`, where `random_baseline` is `0` for generative tasks and `1/n` for multi-choice QA with `n` choices.
104
+ - Results are aggregated by calculating the average of all the tasks for a given language.
105
+
106
+ ### 🔎 Results
107
+
108
+ You can find:
109
+
110
+ - Detailed numerical results in the [results dataset](link_to_results)
111
+ - Community queries and running status in the [requests dataset](link_to_requests)
112
 
113
+ ### Reproducibility
114
+
115
+ To reproduce the results, you can use [our fork of lm_eval](#), as not all of our PRs are currently integrated into the main repository.
116
+
117
+ ## 🙌 Acknowledgements
118
+
119
+ This leaderboard was developed as part of the [#ProjectName](link_to_project) led by [OrganizationName](link_to_organization) thanks to the donation of high-quality evaluation datasets by:
120
+
121
+ - [Institution 1](link_to_institution_1)
122
+ - [Institution 2](link_to_institution_2)
123
+ - [Institution 3](link_to_institution_3)
124
+ - [Institution 4](link_to_institution_4)
125
+ - [Institution 5](link_to_institution_5)
126
+ - [Institution 6](link_to_institution_6)
127
+ - [Institution 7](link_to_institution_7)
128
+ - [Institution 8](link_to_institution_8)
129
+ - [Institution 9](link_to_institution_9)
130
+
131
+ The entities above are ordered chronologically by the date they joined the project. However, the logos in the footer are ordered by the number of datasets donated.
132
+
133
+ Thank you in particular to:
134
+ - Task implementation: [Name 1], [Name 2], [Name 3], [Name 4], [Name 5], [Name 6], [Name 7], [Name 8], [Name 9], [Name 10]
135
+ - Leaderboard implementation: [Name 11], [Name 12]
136
+ - Model evaluation: [Name 13], [Name 14], [Name 15], [Name 16], [Name 17]
137
+ - Communication: [Name 18], [Name 19]
138
+ - Organization & colab leads: [Name 20], [Name 21], [Name 22], [Name 23], [Name 24], [Name 25], [Name 26], [Name 27], [Name 28], [Name 29], [Name 30]
139
+
140
+ For information about the dataset authors please check the corresponding Dataset Cards (linked in the "Tasks" tab) and papers (included in the "Citation" section below). We would like to specially thank the teams that created or open-sourced their datasets specifically for the leaderboard (in chronological order):
141
+ - [Dataset1 Placeholder] and [Dataset2 Placeholder]: [Team members placeholder]
142
+ - [Dataset3 Placeholder], [Dataset4 Placeholder] and [Dataset5 Placeholder]: [Team members placeholder]
143
+ - [Dataset6 Placeholder]: [Team members placeholder]
144
+
145
+ We also thank [Institution1 Placeholder], [Institution2 Placeholder], [Organization Placeholder], [Person1 Placeholder], [Person2 Placeholder] and [Institution3 Placeholder] for sponsoring the inference GPUs.
146
+
147
+ ## 🚀 Collaborate!
148
+
149
+ We would like to create a leaderboard as diverse as possible, reach out if you would like us to include your evaluation dataset!
150
+
151
+ Comments and suggestions are more than welcome! Visit the [👏 Community](<Community Page Placeholder>) page, tell us what you think about the leaderboard and how we can improve it, or go ahead and open a PR!
152
+
153
+ Thank you very much! 💛
154
 
155
  """
156
 
157
+ LLM_BENCHMARKS_TEXT_ZH = """
158
+ ## 💡 关于 "多语言基准 MMLU 排行榜"
159
+
160
+ - 新闻稿:[待定 - XXX](#), [待定 - XXX](#), [待定 - XXX](#), [待定 - XXX](#)
161
+ - YouTube:[待定 - XXX](#)
162
+
163
+ ### 概述
164
+ **多语言大规模多任务语言理解 (MMMLU)** 基准是一个全面的评估平台,旨在评估 AI 模型在各个领域的通用知识能力。它包括一系列跨越 **57 个不同领域** 的 **问答 (QA)** 任务,从基础知识到法律、物理、历史、计算机科学等高级专业主题。
165
+
166
+ ### 翻译工作
167
+ 对于本次评估,我们使用了 **OpenAI MMMLU 数据集**,该数据集已经广泛策划并测试了 AI 模型的多语言理解能力。该数据集包括 14 种不同的语言,专门设计用来评估 AI 模型在 57 个领域中处理各种通用知识任务的能力。
168
+
169
+ 尽管测试集的翻译是由 OpenAI 执行的,但它确保了评估多语言模型的高准确性和可靠性。通过利用这个预先存在的、专业策划的数据集,我们旨在专注于模型在多种语言中的表现,而无需我们额外进行翻译工作。
170
+
171
+ ### 致力于多语言 AI
172
+ 通过专注于人工翻译并公开翻译后的测试集和评估代码,我们旨在促进能够处理多语言任务的 AI 模型的开发,并使其更加准确。这也体现了我们在改善 AI 在低资源语言中的表现以及推动全球技术包容性方面的承诺。
173
+
174
+ ### 涵盖的语言区域
175
+ MMMLU 基准包括以下语言区域的翻译测试集:
176
+ - **AR_XY**:阿拉伯语
177
+ - **BN_BD**:孟加拉语
178
+ - **DE_DE**:德语
179
+ - **ES_LA**:西班牙语
180
+ - **FR_FR**:法语
181
+ - **HI_IN**:印地语
182
+ - **ID_ID**:印尼语
183
+ - **IT_IT**:意大利语
184
+ - **JA_JP**:日语
185
+ - **KO_KR**:韩语
186
+ - **PT_BR**:巴西葡萄牙语
187
+ - **SW_KE**:斯瓦希里语
188
+ - **YO_NG**:约鲁巴语
189
+ - **ZH_CN**:简体中文
190
+
191
+ ### 目的
192
+ MMMLU 排行榜旨在为比较 AI 模型在这些多语言和多领域中的表现提供统一的基准。通过包括 **57 个领域** 中的 **问答任务**,它评估了模型在多语言中回答通用知识问题的能力,确保了多语言理解和推理的高标准。
193
+
194
+ ### 目标
195
+ 我们的主要目标是为 AI 模型在不同语言和领域中的表现提供可靠的比较,帮助开发者和研究人员评估和提高他们模型的多语言能力。通过强调高质量的翻译和包括广泛的主题,我们努力使 AI 模型在全球不同社区中更加稳健和有用。
196
+
197
+ ### 🤗 工作原理
198
+
199
+ 在 "提交这里" 标签页上提交模型进行自动评估!
200
+
201
+ ### 📈 任务
202
+
203
+ 我们评估模型在多个关键基准上的表现,重点关注 **多语言大规模多任务语言理解 (MMLU)** 及其变体,包括 MMLU、C-MMLU、阿拉伯语 MMLU、韩语 MMLU、马来语 MMLU 等。 这些基准评估了来自 57 个类别(如法律、物理、历史和计算机科学等)的一般知识。
204
+
205
+ 评估使用 [OpenCompass 框架](https://github.com/open-compass/opencompass) 执行,该平台统一了对语言模型的多任务评估。OpenCompass 使我们能够高效地、大规模地执行这些评估,覆盖多种语言和基准。
206
+
207
+ 有关任务的详细信息,请参见 OpenCompass 框架中的 "任务" 标签页。
208
+
209
+ 注:
210
+ - 所有评估均为 5-shot 任务。
211
+ - 结果采用以下公式标准化:`normalized_value = (raw_value - random_baseline) / (max_value - random_baseline)`,其中 `random_baseline` 对于生成任务为 `0`,对于多选 QA 为 `1/n`(`n` 为选择数)。
212
+ - 结果通过计算给定语言的所有任务的平均值来汇总。
213
+
214
+ ### 🔎 结果
215
+
216
+ 你可以找到:
217
+
218
+ - 详细的数值结果在 [结果数据集](link_to_results)
219
+ - 社区查询和运行状态在 [请求数据集](link_to_requests)
220
+
221
+ ### ✅ 可复现性
222
+
223
+ 要复现结果,你可以使用 [我们 fork 的 lm_eval](#),因为并非所有的 PR 都已集成到主仓库中。
224
+
225
+ ## 🙌 致谢
226
+
227
+ 这个排行榜是 [#ProjectName](link_to_project) 项目的一部分,由 [OrganizationName](link_to_organization) 领导,感谢以下机构捐赠了高质量的评估数据集:
228
+
229
+ - [机构 1](link_to_institution_1)
230
+ - [机构 2](link_to_institution_2)
231
+ - [机构 3](link_to_institution_3)
232
+ - [机构 4](link_to_institution_4)
233
+ - [机构 5](link_to_institution_5)
234
+ - [机构 6](link_to_institution_6)
235
+ - [机构 7](link_to_institution_7)
236
+ - [机构 8](link_to_institution_8)
237
+ - [机构 9](link_to_institution_9)
238
+
239
+ 这些实体按加入项目的时间顺序排列,然而页脚的 logo 排列顺序是按照捐赠数据集的数量。
240
+
241
+ 特别感谢:
242
+ - 任务实现:[姓名 1],[姓名 2],[姓名 3],[姓名 4],[姓名 5],[姓名 6],[姓名 7],[姓名 8],[姓名 9],[姓名 10]
243
+ - 排行榜实现:[姓名 11],[姓名 12]
244
+ - 模型评估:[姓名 13],[姓名 14],[姓名 15],[姓名 16],[姓名 17]
245
+ - 沟通:[姓名 18],[姓名 19]
246
+ - 组织与协作领导:[姓名 20],[姓名 21],[姓名 22],[姓名 23],[姓名 24],[姓名 25],[姓名 26],[姓名 27],[姓名 28],[姓名 29],[姓名 30]
247
+
248
+ 有关数据集作者的信息,请查看相应的数据集卡片(可以在 "任务" 标签页中找到)以及论文(在 "引用" 部分提供)。我们特别感谢那些为排行榜专门创建或开源其数据集的团队(按时间顺序):
249
+ - [数据集1 占位符] 和 [数据集2 占位符]: [团队成员占位符]
250
+ - [数据集3 占位符],[数据集4 占位符] 和 [数据集5 占位符]: [团队成员占位符]
251
+ - [数据集6 占位符]: [团队成员占位符]
252
+
253
+ 我们还感谢 [机构1 占位符],[机构2 占位符],[组织占位符],[人员1 占位符],[人员2 占位符] 和 [机构3 占位符] 提供推理 GPU 支持。
254
+
255
+ ## 🚀 合作!
256
+
257
+ 我们希望创建一个尽可能多样化的排行榜,欢迎联系我们如果你希望我们将你的评估数据集包含在内!
258
+
259
+ 评论和建议非常欢迎!请访问 [👏 社区](<Community Page Placeholder>) 页面,告诉我们你对本排行榜的看法以及我们如何改进,或者直接打开一个 PR!
260
+
261
+ 非常感谢! 💛
262
+ """
263
+
264
  EVALUATION_QUEUE_TEXT = """
265
  ## Some good practices before submitting a model
266
 
 
294
  CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
295
  CITATION_BUTTON_TEXT = r"""
296
  """
297
+ EVALUATION_QUEUE_TEXT_ZH = """
298
+ ## 提交模型前的一些良好实践
299
+
300
+ ### 1) 确保你可以使用 AutoClasses 加载你的模型和分词器:
301
+ ```python
302
+ from transformers import AutoConfig, AutoModel, AutoTokenizer
303
+ config = AutoConfig.from_pretrained("your model name", revision=revision)
304
+ model = AutoModel.from_pretrained("your model name", revision=revision)
305
+ tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
306
+ ```
307
+ 如果此步骤失败,请按照错误信息进行调试,可能是你的模型上传不正确。
308
+
309
+ 注意:确保你的模型是公开的! 注意:如果你的模型需要 use_remote_code=True,目前我们不支持该选项,但我们正在努力添加此功能,请保持关注!
310
+
311
+ ### 2) 将你的模型权重转换为 safetensors
312
+ 这是一个新的权重存储格式,加载和使用时更安全、更快速。它还将允许我们将模型的参数数量添加到 Extended Viewer 中!
313
+
314
+ ### 3) 确保你的模型具有开放许可!
315
+ 这是一个针对开放 LLM 的排行榜,我们希望尽可能多的人知道他们可以使用你的模型 🤗
316
+
317
+ ### 4) 填写你的模型卡
318
+ 当我们将额外的信息添加到排行榜时,它将自动从模型卡中获取。
319
+
320
+ ## 模型失败时的处理
321
+ 如果你的模型出现在 FAILED 分类中,表示其执行停止。 首先确保你已经遵循了上述步骤。 如果一切都完成,检查你是否可以使用上面的命令在本地启动 EleutherAIHarness 来测试你的模型(你可以添加 --limit 来限制每个任务的示例数)。 """
322
+
323
+ CITATION_BUTTON_LABEL = "复制以下代码引用这些结果"
324
+ CITATION_BUTTON_TEXT = r"""
325
+ """
326
+ LOGOS = [
327
+ "logo/HuggingFace.png",
328
+ "logo/openai-logo.png",
329
+ "logo/logo_qwen.png",
330
+ "logo/CAIS.png"
331
+ ]
src/envs.py CHANGED
@@ -6,15 +6,17 @@ from huggingface_hub import HfApi
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 
10
  # ----------------------------------
11
 
12
- REPO_ID = f"{OWNER}/leaderboard"
13
  QUEUE_REPO = f"{OWNER}/requests"
14
  RESULTS_REPO = f"{OWNER}/results"
15
 
16
  # If you setup a cache later, just change HF_HOME
17
  CACHE_PATH=os.getenv("HF_HOME", ".")
 
18
 
19
  # Local caches
20
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 
6
  # ----------------------------------
7
  TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
 
9
+ OWNER = "Multilingual-MMLU-Benchmark-Leaderboard" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
+ DEVICE = "cpu" # "cuda:0" if you add compute
11
  # ----------------------------------
12
 
13
+ REPO_ID = f"{OWNER}/Multilingual-MMLU-Benchmark-Leaderboard"
14
  QUEUE_REPO = f"{OWNER}/requests"
15
  RESULTS_REPO = f"{OWNER}/results"
16
 
17
  # If you setup a cache later, just change HF_HOME
18
  CACHE_PATH=os.getenv("HF_HOME", ".")
19
+ # print('*******',CACHE_PATH)
20
 
21
  # Local caches
22
  EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
src/leaderboard/read_evals.py CHANGED
@@ -10,7 +10,9 @@ import numpy as np
10
  from src.display.formatting import make_clickable_model
11
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
  from src.submission.check_validity import is_model_on_hub
 
13
 
 
14
 
15
  @dataclass
16
  class EvalResult:
@@ -18,14 +20,14 @@ class EvalResult:
18
  """
19
  eval_name: str # org_model_precision (uid)
20
  full_model: str # org/model (path on hub)
21
- org: str
22
  model: str
23
  revision: str # commit hash, "" if main
24
  results: dict
25
  precision: Precision = Precision.Unknown
26
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
27
  weight_type: WeightType = WeightType.Original # Original or Adapter
28
- architecture: str = "Unknown"
29
  license: str = "?"
30
  likes: int = 0
31
  num_params: int = 0
@@ -37,6 +39,7 @@ class EvalResult:
37
  """Inits the result from the specific model result file"""
38
  with open(json_filepath) as fp:
39
  data = json.load(fp)
 
40
 
41
  config = data.get("config")
42
 
@@ -70,7 +73,6 @@ class EvalResult:
70
  results = {}
71
  for task in Tasks:
72
  task = task.value
73
-
74
  # We average all scores of a given metric (not all metrics are present in all files)
75
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
76
  if accs.size == 0 or any([acc is None for acc in accs]):
@@ -85,7 +87,7 @@ class EvalResult:
85
  org=org,
86
  model=model,
87
  results=results,
88
- precision=precision,
89
  revision= config.get("model_sha", ""),
90
  still_on_hub=still_on_hub,
91
  architecture=architecture
@@ -93,7 +95,8 @@ class EvalResult:
93
 
94
  def update_with_request_file(self, requests_path):
95
  """Finds the relevant request file for the current model and updates info with it"""
96
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
97
 
98
  try:
99
  with open(request_file, "r") as f:
@@ -109,7 +112,9 @@ class EvalResult:
109
 
110
  def to_dict(self):
111
  """Converts the Eval Result to a dict compatible with our dataframe display"""
112
- average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
 
 
113
  data_dict = {
114
  "eval_name": self.eval_name, # not a column, just a save name,
115
  AutoEvalColumn.precision.name: self.precision.value.name,
@@ -136,6 +141,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
136
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
137
  request_files = os.path.join(
138
  requests_path,
 
139
  f"{model_name}_eval_request_*.json",
140
  )
141
  request_files = glob.glob(request_files)
 
10
  from src.display.formatting import make_clickable_model
11
  from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
12
  from src.submission.check_validity import is_model_on_hub
13
+ # from src.tasks import Categories
14
 
15
+ # tasks_df = pd.read_csv("tasks.csv")
16
 
17
  @dataclass
18
  class EvalResult:
 
20
  """
21
  eval_name: str # org_model_precision (uid)
22
  full_model: str # org/model (path on hub)
23
+ org: str
24
  model: str
25
  revision: str # commit hash, "" if main
26
  results: dict
27
  precision: Precision = Precision.Unknown
28
  model_type: ModelType = ModelType.Unknown # Pretrained, fine tuned, ...
29
  weight_type: WeightType = WeightType.Original # Original or Adapter
30
+ architecture: str = "Unknown"
31
  license: str = "?"
32
  likes: int = 0
33
  num_params: int = 0
 
39
  """Inits the result from the specific model result file"""
40
  with open(json_filepath) as fp:
41
  data = json.load(fp)
42
+ # print('#####',json_filepath)
43
 
44
  config = data.get("config")
45
 
 
73
  results = {}
74
  for task in Tasks:
75
  task = task.value
 
76
  # We average all scores of a given metric (not all metrics are present in all files)
77
  accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
78
  if accs.size == 0 or any([acc is None for acc in accs]):
 
87
  org=org,
88
  model=model,
89
  results=results,
90
+ precision=precision,
91
  revision= config.get("model_sha", ""),
92
  still_on_hub=still_on_hub,
93
  architecture=architecture
 
95
 
96
  def update_with_request_file(self, requests_path):
97
  """Finds the relevant request file for the current model and updates info with it"""
98
+ request_file = get_request_file_for_model(requests_path, self.full_model.split("/")[-1], self.precision.value.name)
99
+ # print("########",request_file)
100
 
101
  try:
102
  with open(request_file, "r") as f:
 
112
 
113
  def to_dict(self):
114
  """Converts the Eval Result to a dict compatible with our dataframe display"""
115
+ keys_to_average = ['mmmlu', 'mmlu', 'cmmlu']
116
+ average = sum([self.results[key] for key in keys_to_average if self.results.get(key) is not None]) / len(
117
+ keys_to_average)
118
  data_dict = {
119
  "eval_name": self.eval_name, # not a column, just a save name,
120
  AutoEvalColumn.precision.name: self.precision.value.name,
 
141
  """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
142
  request_files = os.path.join(
143
  requests_path,
144
+ model_name,
145
  f"{model_name}_eval_request_*.json",
146
  )
147
  request_files = glob.glob(request_files)