Spaces:

TIGER-Lab
/

Science-Leaderboard

Running

App Files Community

wenhuchen committed on Apr 17

Commit

01877fc

•

1 Parent(s): bb81b02

adding MMLU-STEM

Browse files

Files changed (1) hide show

utils.py +19 -5

utils.py CHANGED Viewed

@@ -15,9 +15,10 @@ MODEL_INFO = [
     "MATH",
     "GSM",
     "GPQA",
     ]
-DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number']
 SUBMISSION_NAME = "science_leaderboard_submission"
 SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
@@ -37,6 +38,7 @@ LEADERBORAD_INTRODUCTION = """# Science Leaderboard
         <li> GSM8K (4-shot): this contains the test set of 1320 questions from grade school math word problems. This dataset is mainly covering algebra problems.
         <li> TheoremQA (5-shot): this contains the test set of 800 questions collected from college-level exams. This covers math, physics, engineering and finance.
         <li> GPQA (5-shot): this contains the test of 198 questions from college-level dataset GPQA-diamond. This covers many fields like chemistry, genetics, biology, etc.
     </ul>
     **"How to evaluate your model and submit your results?"**<br>
@@ -66,6 +68,10 @@ TheoremQA: A Theorem-driven Question Answering dataset<br>
 GPQA: A Graduate-Level Google-Proof Q&A Benchmark<br>
 <a href='https://arxiv.org/pdf/2311.12022.pdf'>Paper</a><br>
 <a href='https://github.com/idavidrein/gpqa'>Code</a>
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
@@ -92,6 +98,12 @@ CITATION_BUTTON_TEXT = r"""@inproceedings{hendrycks2021measuring,
   author={Rein, David and Hou, Betty Li and Stickland, Asa Cooper and Petty, Jackson and Pang, Richard Yuanzhe and Dirani, Julien and Michael, Julian and Bowman, Samuel R},
   journal={arXiv preprint arXiv:2311.12022},
   year={2023}
 }"""
 SUBMIT_INTRODUCTION = """# Submit on Science Leaderboard Introduction
@@ -104,8 +116,9 @@ SUBMIT_INTRODUCTION = """# Submit on Science Leaderboard Introduction
     "Repo": "https://huggingface.co/[MODEL_NAME]"
     "TheoremQA": 50,
     "MATH": 50,
-    "GSM": 50
-    "GPQA": 50
 }
 ```
 After submitting, you can click the "Refresh" button to see the updated leaderboard(it may takes few seconds).
@@ -115,7 +128,8 @@ def get_df():
     repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
     repo.git_pull()
     df = pd.read_csv(CSV_DIR)
-    df['Avg'] = df[['TheoremQA', 'MATH', 'GSM', 'GPQA']].mean(axis=1).round(1)
     df = df.sort_values(by=['Avg'], ascending=False)
     return df[COLUMN_NAMES]
@@ -126,7 +140,7 @@ def add_new_eval(
         return "Error! Empty file!"
     upload_data=json.loads(input_file)
-    data_row = [f'[{upload_data["Model"]}]({upload_data["Repo"]})', upload_data['TheoremQA'], upload_data['MATH'], upload_data['GSM'], upload_data['GPQA']]
     submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
     submission_repo.git_pull()

     "MATH",
     "GSM",
     "GPQA",
+    "MMLU-STEM"
     ]
+DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number']
 SUBMISSION_NAME = "science_leaderboard_submission"
 SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
         <li> GSM8K (4-shot): this contains the test set of 1320 questions from grade school math word problems. This dataset is mainly covering algebra problems.
         <li> TheoremQA (5-shot): this contains the test set of 800 questions collected from college-level exams. This covers math, physics, engineering and finance.
         <li> GPQA (5-shot): this contains the test of 198 questions from college-level dataset GPQA-diamond. This covers many fields like chemistry, genetics, biology, etc.
+        <li> MMLU-STEM (5-shot): this contains the test of 3.3K questions from MMLU dataset. This covers many fields like math, chemistry, genetics, biology, computer science, anatomy, astronomy, etc.
     </ul>
     **"How to evaluate your model and submit your results?"**<br>
 GPQA: A Graduate-Level Google-Proof Q&A Benchmark<br>
 <a href='https://arxiv.org/pdf/2311.12022.pdf'>Paper</a><br>
 <a href='https://github.com/idavidrein/gpqa'>Code</a>
+MMLU: Measuring Massive Multitask Language Understanding<br>
+<a href='https://arxiv.org/pdf/2009.03300.pdf'>Paper</a><br>
+<a href='https://github.com/hendrycks/test'>Code</a>
 """
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
   author={Rein, David and Hou, Betty Li and Stickland, Asa Cooper and Petty, Jackson and Pang, Richard Yuanzhe and Dirani, Julien and Michael, Julian and Bowman, Samuel R},
   journal={arXiv preprint arXiv:2311.12022},
   year={2023}
+}
+@inproceedings{hendrycks2020measuring,
+  title={Measuring Massive Multitask Language Understanding},
+  author={Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
+  booktitle={International Conference on Learning Representations},
+  year={2020}
 }"""
 SUBMIT_INTRODUCTION = """# Submit on Science Leaderboard Introduction
     "Repo": "https://huggingface.co/[MODEL_NAME]"
     "TheoremQA": 50,
     "MATH": 50,
+    "GSM": 50,
+    "GPQA": 50,
+    "MMLU-STEM": 50
 }
 ```
 After submitting, you can click the "Refresh" button to see the updated leaderboard(it may takes few seconds).
     repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
     repo.git_pull()
     df = pd.read_csv(CSV_DIR)
+    print(df)
+    df['Avg'] = df[['TheoremQA', 'MATH', 'GSM', 'GPQA', 'MMLU-STEM']].mean(axis=1).round(1)
     df = df.sort_values(by=['Avg'], ascending=False)
     return df[COLUMN_NAMES]
         return "Error! Empty file!"
     upload_data=json.loads(input_file)
+    data_row = [f'[{upload_data["Model"]}]({upload_data["Repo"]})', upload_data['TheoremQA'], upload_data['MATH'], upload_data['GSM'], upload_data['GPQA'], upload_data['MMLU-STEM']]
     submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
     submission_repo.git_pull()