File size: 7,262 Bytes
6590844
 
 
 
 
 
 
 
 
 
6ed5ca9
6590844
 
6ed5ca9
 
 
236a68e
01877fc
6ed5ca9
 
01877fc
6ed5ca9
6590844
af70d14
6590844
6ed5ca9
 
 
bd5120e
6ed5ca9
b6fb488
bd5120e
6ed5ca9
 
bd5120e
 
b6fb488
 
 
 
01877fc
bd5120e
b6fb488
 
 
 
bd5120e
6ed5ca9
 
 
 
 
 
bd5120e
 
4abf394
 
 
 
 
 
 
 
 
 
 
 
 
 
3c4d070
01877fc
 
 
 
6ed5ca9
 
 
236a68e
bd5120e
 
236a68e
 
bd5120e
 
 
 
 
 
 
 
6ed5ca9
 
 
 
bd5120e
 
 
 
 
 
01877fc
 
 
 
 
 
6ed5ca9
6590844
bd5120e
6590844
 
 
 
 
bb81b02
 
 
 
01877fc
 
 
6590844
 
 
 
 
 
 
 
 
01877fc
900d902
6590844
 
 
 
 
 
 
 
 
01877fc
6590844
 
 
 
bb81b02
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6590844
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import pandas as pd
import gradio as gr
import csv
import json
import os
import shutil
from huggingface_hub import Repository

HF_TOKEN = os.environ.get("HF_TOKEN")

# Column headers for the leaderboard table. "Model (CoT)" renders as a
# markdown link; the rest are numeric scores.
MODEL_INFO = [
    "Model (CoT)",
    "Avg",
    "TheoremQA",
    "MATH",
    "GSM",
    "GPQA",
    "MMLU-STEM"
    ]

# Gradio Dataframe column datatypes, positionally aligned with MODEL_INFO.
# NOTE: the name is misspelled ("TITILE") but kept for backward compatibility
# with any external references.
DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number']

# Hugging Face dataset repo that stores the submitted results CSV.
SUBMISSION_NAME = "science_leaderboard_submission"
SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/wenhu/", SUBMISSION_NAME)
CSV_DIR = "./science_leaderboard_submission/results.csv"

COLUMN_NAMES = MODEL_INFO

# Markdown shown at the top of the leaderboard tab.
LEADERBORAD_INTRODUCTION = """# Science Leaderboard
    
    **"Which large language model is the BEST on science and engineering?"**<br>
    🏆 Welcome to the **Science** leaderboard! The leaderboard covers the most popular evaluation for different science subjects including math, physics, biology, chemistry, computer science, finance.
    <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
    </div>
    The evaluation set from the following datasets are being included in the leaderboard.
    <ul>
        <li> MATH (4-shot): this contains the test set of 5000 questions from American Math contest covering different fields like algebra, calculus, statistics, geometry, linear algebra, number theory.
        <li> GSM8K (4-shot): this contains the test set of 1320 questions from grade school math word problems. This dataset is mainly covering algebra problems.
        <li> TheoremQA (5-shot): this contains the test set of 800 questions collected from college-level exams. This covers math, physics, engineering and finance.
        <li> GPQA (5-shot): this contains the test of 198 questions from college-level dataset GPQA-diamond. This covers many fields like chemistry, genetics, biology, etc.
        <li> MMLU-STEM (5-shot): this contains the test of 3.3K questions from MMLU dataset. This covers many fields like math, chemistry, genetics, biology, computer science, anatomy, astronomy, etc.
    </ul>

    **"How to evaluate your model and submit your results?"**<br>
    Please refer to the guideline in <a href="https://github.com/TIGER-AI-Lab/MAmmoTH/blob/main/math_eval/README.md">Github</a> to evaluate your own model.

    <a href='https://hits.seeyoufarm.com'><img src='https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2FTIGER-Lab%2FTheoremQA-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false'></a>
    """

TABLE_INTRODUCTION = """
    """

# Markdown for the "About" tab: citations/links for each benchmark.
LEADERBORAD_INFO = """
We list the information of the used datasets as follows:<br>

MATH: Measuring Mathematical Problem Solving With the MATH Dataset<br>
<a href='https://arxiv.org/pdf/2103.03874.pdf'>Paper</a><br>
<a href='https://github.com/hendrycks/math'>Code</a><br>

GSM8K: Training Verifiers to Solve Math Word Problems<br>
<a href='https://arxiv.org/pdf/2110.14168.pdf'>Paper</a><br>
<a href='https://github.com/openai/grade-school-math'>Code</a><br>

TheoremQA: A Theorem-driven Question Answering dataset<br>
<a href='https://arxiv.org/pdf/2305.12524.pdf'>Paper</a><br>
<a href='https://github.com/TIGER-AI-Lab/TheoremQA'>Code</a><br>

GPQA: A Graduate-Level Google-Proof Q&A Benchmark<br>
<a href='https://arxiv.org/pdf/2311.12022.pdf'>Paper</a><br>
<a href='https://github.com/idavidrein/gpqa'>Code</a>

MMLU: Measuring Massive Multitask Language Understanding<br>
<a href='https://arxiv.org/pdf/2009.03300.pdf'>Paper</a><br>
<a href='https://github.com/hendrycks/test'>Code</a>
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# BibTeX entries for the five benchmarks; raw string so backslashes survive.
CITATION_BUTTON_TEXT = r"""@inproceedings{hendrycks2021measuring,
  title={Measuring Mathematical Problem Solving With the MATH Dataset},
  author={Hendrycks, Dan and Burns, Collin and Kadavath, Saurav and Arora, Akul and Basart, Steven and Tang, Eric and Song, Dawn and Steinhardt, Jacob},
  booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)},
  year={2021}
}
@article{cobbe2021training,
  title={Training verifiers to solve math word problems},
  author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
  journal={arXiv preprint arXiv:2110.14168},
  year={2021}
}
@inproceedings{chen2023theoremqa,
  title={Theoremqa: A theorem-driven question answering dataset},
  author={Chen, Wenhu and Yin, Ming and Ku, Max and Lu, Pan and Wan, Yixin and Ma, Xueguang and Xu, Jianyu and Wang, Xinyi and Xia, Tony},
  booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
  year={2023}
}
@article{rein2023gpqa,
  title={Gpqa: A graduate-level google-proof q\&a benchmark},
  author={Rein, David and Hou, Betty Li and Stickland, Asa Cooper and Petty, Jackson and Pang, Richard Yuanzhe and Dirani, Julien and Michael, Julian and Bowman, Samuel R},
  journal={arXiv preprint arXiv:2311.12022},
  year={2023}
}
@inproceedings{hendrycks2020measuring,
  title={Measuring Massive Multitask Language Understanding},
  author={Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
  booktitle={International Conference on Learning Representations},
  year={2020}
}"""

# Markdown shown on the submission tab. The fenced example must be valid
# JSON, since users copy it as a template (a comma after "Repo" was missing
# previously, making the template unparseable).
SUBMIT_INTRODUCTION = """# Submit on Science Leaderboard Introduction

## ⚠ Please note that you need to submit the json file with following format:

```json
{
    "Model": "[NAME]",
    "Repo": "https://huggingface.co/[MODEL_NAME]",
    "TheoremQA": 50,
    "MATH": 50,
    "GSM": 50,
    "GPQA": 50,
    "MMLU-STEM": 50
}
```
After submitting, you can click the "Refresh" button to see the updated leaderboard (it may take a few seconds).

"""
def get_df() -> pd.DataFrame:
    """Pull the latest results CSV from the submission repo and build the table.

    Clones/updates the submission dataset repo locally, loads the results CSV,
    computes the per-model average over the five benchmarks, and returns the
    rows sorted by that average (best first), restricted to COLUMN_NAMES.

    Returns:
        pd.DataFrame: leaderboard table with columns COLUMN_NAMES.
    """
    # repo_type="dataset" matches how add_new_eval opens the same repo;
    # without it the clone targets the model-repo endpoint and fails.
    repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                      use_auth_token=HF_TOKEN, repo_type="dataset")
    repo.git_pull()
    df = pd.read_csv(CSV_DIR)
    df['Avg'] = df[['TheoremQA', 'MATH', 'GSM', 'GPQA', 'MMLU-STEM']].mean(axis=1).round(1)
    df = df.sort_values(by=['Avg'], ascending=False)
    return df[COLUMN_NAMES]

def add_new_eval(
    input_file,
):
    """Validate an uploaded JSON submission and append it to the results CSV.

    Args:
        input_file: raw JSON text of the submission (or None when nothing
            was uploaded).

    Returns:
        An error message string when validation fails; None after a
        successful (or duplicate) submission attempt.
    """
    if input_file is None:
        return "Error! Empty file!"

    # Reject malformed uploads instead of crashing the Space with an
    # uncaught JSONDecodeError / KeyError.
    try:
        upload_data = json.loads(input_file)
    except (json.JSONDecodeError, TypeError):
        return "Error! Could not parse the uploaded file as JSON!"

    required = ("Model", "Repo", "TheoremQA", "MATH", "GSM", "GPQA", "MMLU-STEM")
    missing = [key for key in required if key not in upload_data]
    if missing:
        return f"Error! Missing required fields: {', '.join(missing)}"

    # First cell is a markdown link; it doubles as the dedup key below.
    data_row = [f'[{upload_data["Model"]}]({upload_data["Repo"]})',
                upload_data['TheoremQA'], upload_data['MATH'], upload_data['GSM'],
                upload_data['GPQA'], upload_data['MMLU-STEM']]

    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
                                 use_auth_token=HF_TOKEN, repo_type="dataset")
    submission_repo.git_pull()

    # Collect existing model cells to avoid duplicate entries.
    already_submitted = []
    with open(CSV_DIR, mode='r', newline='') as file:
        reader = csv.reader(file, delimiter=',')
        for row in reader:
            if row:  # skip blank lines defensively
                already_submitted.append(row[0])

    if data_row[0] not in already_submitted:
        with open(CSV_DIR, mode='a', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(data_row)

        submission_repo.push_to_hub()
        print('Submission Successful')
    else:
        print('The entry already exists')


def refresh_data():
    """Re-fetch the submission repo and return the rebuilt leaderboard table."""
    latest_table = get_df()
    return latest_table