File size: 8,950 Bytes
e353a82
 
 
 
 
 
 
 
 
 
3388e82
e353a82
c66f0bf
5d9b035
 
 
 
3388e82
5d9b035
 
3388e82
5d9b035
 
 
e353a82
 
 
5d9b035
3388e82
f935a66
5d9b035
f935a66
39793ac
3388e82
f61bd87
 
 
 
 
 
4cdb30d
11c0693
f935a66
4cdb30d
f935a66
4cdb30d
 
 
f61bd87
f935a66
86c5f36
 
 
 
 
 
f61bd87
8d679bb
 
f935a66
 
8d679bb
f935a66
e353a82
 
 
f935a66
208c50b
 
 
 
 
f935a66
 
e353a82
965f42a
e353a82
f935a66
e353a82
 
e107e99
 
 
 
 
 
 
 
 
 
 
 
e353a82
6711aa6
e353a82
3388e82
 
e353a82
 
 
 
a7c29bf
3388e82
a7c29bf
e353a82
3388e82
e353a82
 
 
 
 
 
3388e82
bb25558
3388e82
bb25558
3388e82
 
e353a82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
324387b
 
 
3388e82
ddc4b67
 
 
 
 
1f5abbf
ddc4b67
 
 
324387b
ddc4b67
324387b
ddc4b67
324387b
 
ddc4b67
 
 
 
 
c66f0bf
ddc4b67
 
 
 
1f5abbf
ddc4b67
324387b
ddc4b67
1f5abbf
 
617d783
 
 
 
3388e82
1f5abbf
ddc4b67
 
 
 
 
 
 
1f5abbf
ddc4b67
 
a7c29bf
 
 
5d9b035
67f06d3
a7c29bf
324387b
 
 
67f06d3
1f5abbf
ec33066
 
 
 
 
5d9b035
ec33066
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository

# Hugging Face access token read from the environment; None when HF_TOKEN is unset.
HF_TOKEN = os.environ.get("HF_TOKEN")


# Column headers of the leaderboard table, in display order.
MODEL_INFO = [
    "Models", "Model Size(B)", "Data Source",
    "DP Acc", "DP False Positive Rate", "DP False Negative Score", "DP MCC",
    "CoT Acc", "CoT False Positive Rate", "CoT False Negative Score", "CoT MCC"
]


# One datatype label per MODEL_INFO column (same length, same order).
# NOTE(review): presumably the Gradio Dataframe `datatype` list — confirm at the call site.
DATA_TITLE_TYPE = ['markdown', 'str', 'markdown',
                   'number', 'number', 'number', 'number', 'number', 'number', 'number', 'number']

# Name of the submission repository; also used as the local checkout directory.
SUBMISSION_NAME = "chumor_leaderboard_submission"
# A URL always uses forward slashes, so build it with string formatting:
# os.path.join would insert the OS path separator (a backslash on Windows).
SUBMISSION_URL = f"https://huggingface.co/spaces/dnaihao/{SUBMISSION_NAME}"
# Path of the results table inside the local checkout.
CSV_DIR = "./chumor_leaderboard_submission/results.csv"

# Alias of MODEL_INFO (the same list object, not a copy).
COLUMN_NAMES = MODEL_INFO

# Markdown introduction for the leaderboard.
# NOTE(review): apart from the title and the opening words, this text describes
# MMLU-Pro (its option count, domains, links and paper) rather than Chumor —
# confirm whether the copy is intended.
LEADERBOARD_INTRODUCTION = """# Chumor Leaderboard

## Introduction
We introduce Chumor, an enhanced benchmark designed to evaluate language understanding models across broader and more challenging tasks. Building on the Massive Multitask Language Understanding (MMLU) dataset, MMLU-Pro integrates more challenging, reasoning-focused questions and increases the answer choices per question from four to ten, significantly raising the difficulty and reducing the chance of success through random guessing. MMLU-Pro comprises over 12,000 rigorously curated questions from academic exams and textbooks, spanning 14 diverse domains including Biology, Business, Chemistry, Computer Science, Economics, Engineering, Health, History, Law, Math, Philosophy, Physics, Psychology, and Others.  

Note: For inclusion in our leaderboard, submissions must provide substantial evidence demonstrating that their system is a genuine language model. We maintain strict verification standards to ensure the integrity and comparability of the results.

## What's new about MMLU-Pro

Compared to the original MMLU, there are three major differences:

- The original MMLU dataset only contains 4 options, MMLU-Pro increases it to 10 options. The increase in options will make the evaluation more realistic and challenging. The random guessing will lead to a much lower score.
- The original MMLU dataset contains mostly knowledge-driven questions without requiring much reasoning. Therefore, PPL results are normally better than CoT. In our dataset, we increase the problem difficulty and integrate more reasoning-focused problems. In MMLU-Pro, CoT can be 20% higher than PPL. 
- By increasing the distractor numbers, we significantly reduce the probability of correct guess by chance to boost the benchmark’s robustness. Specifically, with 24 different prompt styles tested, the sensitivity of model scores to prompt variations decreased from 4-5% in MMLU to just 2% in MMLU-Pro.

For detailed information about the dataset, visit our page on Hugging Face:  https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro. 

If you are interested in replicating these results or wish to evaluate your models using our dataset, access our evaluation scripts available on GitHub: https://github.com/TIGER-AI-Lab/MMLU-Pro.

If you would like to learn more details about our dataset, please check out our paper: https://arxiv.org/abs/2406.01574.

Below you can find the accuracies of different models tested on this dataset.

"""

# Text for the table section; currently contains only whitespace.
TABLE_INTRODUCTION = """
    """

# Markdown summary of the dataset (question format and sources).
# NOTE(review): the sources listed here are the MMLU-Pro ones — confirm they
# are meant to describe this leaderboard's dataset.
LEADERBOARD_INFO = """
## Dataset Summary
- **Questions and Options:** Each question within the dataset typically has **ten** multiple-choice options, except for some that were reduced during the manual review process to remove unreasonable choices. This increase from the original **four** options per question is designed to enhance complexity and robustness, necessitating deeper reasoning to discern the correct answer among a larger pool of potential distractors.
- **Sources:** The dataset consolidates questions from several sources:
  - **Original MMLU Questions:** Part of the dataset comes from the original MMLU dataset. We remove the trivial and ambiguous questions.
  - **STEM Website:** Hand-picking high-quality STEM problems from the Internet.
  - **TheoremQA:** High-quality human-annotated questions requiring theorems to solve.
  - **SciBench:** Science questions from college exams.
"""

# Label and BibTeX snippet for the citation box.
# The snippet is a raw string so that its contents are taken literally.
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{wang2024mmlu,
  title={Mmlu-pro: A more robust and challenging multi-task language understanding benchmark},
  author={Wang, Yubo and Ma, Xueguang and Zhang, Ge and Ni, Yuansheng and Chandra, Abhranil and Guo, Shiguang and Ren, Weiming and Arulraj, Aaran and He, Xuan and Jiang, Ziyan and others},
  journal={arXiv preprint arXiv:2406.01574},
  year={2024}
}
"""

# Markdown instructions for submitting results, including the expected JSON
# layout of the uploaded file. The example must itself be well-formed, since
# submitters copy it: the "category" value needs its closing quote.
SUBMIT_INTRODUCTION = """# Submit on MMLU-Pro Leaderboard Introduction

## ⚠ Please note that you need to submit the JSON file with the following format:

```json
[
    {
        "question_id": 123,
        "question": "abc",
        "options": ["abc", "xyz", ...], 
        "answer": "ABC",
        "answer_index": 1,
        "category": "abc",
        "pred": "B",
        "model_outputs": ""
    }, ...
]
```
You can generate an output file in the above format using the evaluation script provided in our GitHub repository. For your convenience, the script and detailed instructions are available at GitHub: https://github.com/TIGER-AI-Lab/MMLU-Pro. After generating the file, please send us an email at yubo.wang.sunny@gmail.com, attaching the output file.
"""


def get_df():
    """Pull the submission repository and load the results table.

    Creates a ``huggingface_hub.Repository`` for ``SUBMISSION_URL`` with
    ``SUBMISSION_NAME`` as the local directory, calls ``git_pull()``, and reads
    ``CSV_DIR`` into a DataFrame. The 'Model Size(B)' column is normalised with
    ``process_model_size``.

    Returns:
        pandas.DataFrame: the table sorted in descending order by 'Overall'
        when that column exists, otherwise by 'DP Acc'; left in file order if
        neither column is present.
    """
    repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    df['Model Size(B)'] = df['Model Size(B)'].apply(process_model_size)
    # 'Overall' is not one of the columns declared in MODEL_INFO, so sorting on
    # it unconditionally raises KeyError for a CSV that follows that layout.
    # Fall back to 'DP Acc', the first metric column of the table.
    sort_key = next((col for col in ('Overall', 'DP Acc') if col in df.columns), None)
    if sort_key is not None:
        df = df.sort_values(by=[sort_key], ascending=False)
    return df


def add_new_eval(
    input_file,
):
    """Append an uploaded result to the submission CSV and push it.

    Args:
        input_file: JSON text of the submission, or ``None`` when nothing was
            uploaded. The decoded object must provide the keys "Model" and
            "Overall".

    Returns:
        The string "Error! Empty file!" when ``input_file`` is ``None``;
        otherwise ``None`` (the outcome is reported with ``print``).

    Raises:
        json.JSONDecodeError: if ``input_file`` is not valid JSON.
        KeyError: if "Model" or "Overall" is missing from the decoded object.
    """
    if input_file is None:
        return "Error! Empty file!"

    upload_data = json.loads(input_file)
    print("upload_data:\n", upload_data)
    # NOTE(review): only two fields are written, while MODEL_INFO declares
    # eleven columns — confirm the intended row layout.
    data_row = [f'{upload_data["Model"]}', upload_data['Overall']]
    print("data_row:\n", data_row)
    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                 use_auth_token=HF_TOKEN, repo_type="dataset")
    submission_repo.git_pull()

    # First cell of every existing row. csv.reader yields an empty list for a
    # blank line, so those are skipped instead of raising IndexError on row[0].
    with open(CSV_DIR, mode='r', newline='') as file:
        already_submitted = {row[0] for row in csv.reader(file, delimiter=',') if row}

    if data_row[0] not in already_submitted:
        with open(CSV_DIR, mode='a', newline='') as file:
            csv.writer(file).writerow(data_row)

        submission_repo.push_to_hub()
        print('Submission Successful')
    else:
        print('The entry already exists')

def refresh_data():
    """Reload the results table and return only the ``COLUMN_NAMES`` columns."""
    return get_df()[COLUMN_NAMES]


def search_and_filter_models(df, query, min_size, max_size):
    """Filter the table by model name and by model size.

    Args:
        df: DataFrame with at least the 'Models' and 'Model Size(B)' columns
            plus every column named in ``COLUMN_NAMES``.
        query: Text to look for in 'Models'; a falsy value disables the name
            filter. Matching is delegated to ``search_models``.
        min_size: Inclusive lower bound on 'Model Size(B)'.
        max_size: Inclusive upper bound on 'Model Size(B)'.

    Returns:
        pandas.DataFrame: the matching rows restricted to ``COLUMN_NAMES``.
        When nothing matches, an empty frame with those columns.
    """
    filtered_df = search_models(df.copy(), query)

    def in_range(size):
        # Rows whose size is 'unknown' are compared as if they were 1000.0.
        effective = 1000.0 if size == 'unknown' else size
        return min_size <= effective <= max_size

    # On an empty column, apply() returns an empty Series with the column's
    # own (non-boolean) dtype. pandas would then treat it as a list of column
    # labels rather than a row mask, dropping every column and making the
    # COLUMN_NAMES selection below fail. Casting to bool keeps it a mask.
    size_mask = filtered_df['Model Size(B)'].apply(in_range).astype(bool)

    filtered_df = filtered_df[size_mask]

    return filtered_df[COLUMN_NAMES]


# def search_and_filter_models(df, query, min_size, max_size):
#     filtered_df = df.copy()

#     if query:
#         filtered_df = filtered_df[filtered_df['Models'].str.contains(query, case=False, na=False)]
    
#     def size_filter(x):
#         if isinstance(x, (int, float)):
#             return min_size <= x <= max_size
#         return True 
    
#     filtered_df = filtered_df[filtered_df['Model Size(B)'].apply(size_filter)]
    
#     return filtered_df[COLUMN_NAMES]


def search_models(df, query):
    """Return the rows of ``df`` whose 'Models' value matches ``query``.

    Matching is case-insensitive and missing names never match. ``query`` is
    interpreted as a regular expression when it is a valid pattern; otherwise
    it is matched as a literal substring, so free-typed text containing
    characters such as ``(`` or ``[`` does not raise.

    Args:
        df: DataFrame with a string 'Models' column.
        query: Search text; a falsy value returns ``df`` unchanged.

    Returns:
        pandas.DataFrame: ``df`` itself when ``query`` is falsy, otherwise the
        matching rows.
    """
    if not query:
        return df

    import re  # local import: only needed to validate the pattern

    try:
        re.compile(query)
    except re.error:
        use_regex = False
    else:
        use_regex = True

    mask = df['Models'].str.contains(query, case=False, na=False, regex=use_regex)
    return df[mask]


# def get_size_range(df):
#     numeric_sizes = df[df['Model Size(B)'].apply(lambda x: isinstance(x, (int, float)))]['Model Size(B)']
#     if len(numeric_sizes) > 0:
#         return float(numeric_sizes.min()), float(numeric_sizes.max())
#     return 0, 1000


def get_size_range(df):
    """Return the smallest and largest model size in ``df``.

    Entries equal to 'unknown' in 'Model Size(B)' are counted as 1000.0.

    Args:
        df: DataFrame with a 'Model Size(B)' column holding numbers and/or the
            string 'unknown'.

    Returns:
        tuple[float, float]: ``(min, max)`` of the sizes, or ``(0.0, 1000.0)``
        when the frame has no rows (min/max of an empty column would be NaN).
    """
    sizes = df['Model Size(B)'].apply(lambda x: 1000.0 if x == 'unknown' else x)
    if sizes.empty:
        return 0.0, 1000.0
    return float(sizes.min()), float(sizes.max())


def process_model_size(size):
    """Normalise a raw model-size cell to a float or the string 'unknown'.

    Missing values (per ``pd.isna``), the markers 'unk' and '-', and anything
    ``float()`` cannot convert all map to 'unknown'; every other value is
    returned as ``float(size)``.
    """
    if pd.isna(size) or size in ('unk', '-'):
        return 'unknown'
    try:
        result = float(size)
    except (ValueError, TypeError):
        result = 'unknown'
    return result


def filter_columns_by_subjects(df, selected_subjects=None):
    """Restrict ``df`` to the base columns plus the selected subject columns.

    Args:
        df: Leaderboard DataFrame.
        selected_subjects: Column names to show in addition to the base
            columns ('Models', 'Model Size(B)', 'Data Source', 'DP Acc').
            Any sized collection of names is accepted (list, tuple, ...).
            ``None`` or an empty collection selects ``COLUMN_NAMES``.

    Returns:
        pandas.DataFrame: the requested columns that exist in ``df``, base
        columns first, each column at most once.
    """
    if selected_subjects is None or len(selected_subjects) == 0:
        return df[COLUMN_NAMES]

    base_columns = ['Models', 'Model Size(B)', 'Data Source', 'DP Acc']
    # dict.fromkeys removes repeats while keeping first-seen order, so picking
    # a subject that is already a base column (e.g. 'DP Acc') does not produce
    # a duplicated column. list() lets a tuple be concatenated with the list.
    wanted = dict.fromkeys(base_columns + list(selected_subjects))

    available_columns = [col for col in wanted if col in df.columns]
    return df[available_columns]