wenhu committed
Commit bd5120e • 1 Parent(s): 900d902

update interface

Files changed (1): utils.py +48 -13
utils.py CHANGED
@@ -24,37 +24,71 @@ CSV_DIR = "./science_leaderboard_submission/results.csv"
 
 COLUMN_NAMES = MODEL_INFO
 
- LEADERBORAD_INTRODUCTION = """# TheoremQA Leaderboard
 
- *"Which Model is better on STEM QA?"*
- 🏆 Welcome to the leaderboard of the **TheoremQA**! 🎦 *A Theorem-driven Question Answering dataset* (**EMNLP 2023**)
 <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
- <a href='https://arxiv.org/abs/2305.12524'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
- <a href='https://github.com/TIGER-AI-Lab/TheoremQA'><img src='https://img.shields.io/badge/Github-Repo-grey?logo=github&logoColor=white'></a>
- <a href='https://hits.seeyoufarm.com'><img src='https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2FTIGER-Lab%2FTheoremQA-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false'></a>
 </div>
-
- We propose the first question-answering dataset driven by STEM theorems. We annotated 800 QA pairs covering 350+ theorems spanning across Math, EE&CS, Physics and Finance. The dataset is collected by human experts with very high quality. We provide the dataset as a new benchmark to test the limit of large language models to apply theorems to solve challenging university-level questions.
-
- Please follow the instructions in [TheoremQA](https://github.com/TIGER-AI-Lab/TheoremQA) to use.
 """
 
 TABLE_INTRODUCTION = """
 """
 
 LEADERBORAD_INFO = """
- TheoremQA, a comprehensive benchmark suite for video generative models. We design a comprehensive and hierarchical Evaluation Dimension Suite to decompose "video generation quality" into multiple well-defined dimensions to facilitate fine-grained and objective evaluation. For each dimension and each content category, we carefully design a Prompt Suite as test cases, and sample Generated Videos from a set of video generation models. For each evaluation dimension, we specifically design an Evaluation Method Suite, which uses carefully crafted method or designated pipeline for automatic objective evaluation. We also conduct Human Preference Annotation for the generated videos for each dimension, and show that TheoremQA evaluation results are well aligned with human perceptions. TheoremQA can provide valuable insights from multiple perspectives.
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
- CITATION_BUTTON_TEXT = r"""@inproceedings{chen2023theoremqa,
 title={Theoremqa: A theorem-driven question answering dataset},
 author={Chen, Wenhu and Yin, Ming and Ku, Max and Lu, Pan and Wan, Yixin and Ma, Xueguang and Xu, Jianyu and Wang, Xinyi and Xia, Tony},
 booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
 year={2023}
 }"""
 
- SUBMIT_INTRODUCTION = """# Submit on TheoremQA Leaderboard Introduction
 
 ## ⚠ Please note that you need to submit a JSON file in the following format:
 
@@ -64,6 +98,7 @@ SUBMIT_INTRODUCTION = """# Submit on TheoremQA Leaderboard Introduction
 "TheoremQA": 0.5,
 "MATH": 0.5,
 "GSM": 0.5
 }
 ```
 After submitting, you can click the "Refresh" button to see the updated leaderboard (it may take a few seconds).
 
 COLUMN_NAMES = MODEL_INFO
 
+ LEADERBORAD_INTRODUCTION = """# Science Leaderboard
 
+ *"Which large language model is the BEST at science and engineering?"*
+ 🏆 Welcome to the **Science** leaderboard! The leaderboard covers the most popular evaluations across science subjects, including math, physics, biology, chemistry, computer science, and finance.
 <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
 </div>
+ The evaluation sets from the following datasets are included in the leaderboard.
+ <ul>
+ <li> MATH: the test set of 5000 questions from American math contests, covering fields such as algebra, calculus, statistics, geometry, linear algebra, and number theory.
+ <li> GSM8K: the test set of 1320 grade-school math word problems, mainly covering algebra.
+ <li> TheoremQA: the test set of 800 questions collected from college-level exams, covering math, physics, engineering, and finance.
+ <li> GPQA: the 198 questions of the college-level GPQA-Diamond subset, covering fields such as chemistry, genetics, and biology.
+ </ul>
+ <a href='https://hits.seeyoufarm.com'><img src='https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2FTIGER-Lab%2FTheoremQA-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false'></a>
 """
 
 TABLE_INTRODUCTION = """
 """
 
 LEADERBORAD_INFO = """
+ We list the information of the datasets used as follows:<br>
+
+ MATH: Measuring Mathematical Problem Solving With the MATH Dataset
+ <a href='https://arxiv.org/pdf/2103.03874.pdf'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
+ <a href='https://github.com/hendrycks/math'><img src='https://img.shields.io/badge/Github-Repo-grey?logo=github&logoColor=white'></a>
+ GSM8K: Training Verifiers to Solve Math Word Problems
+ <a href='https://arxiv.org/pdf/2110.14168.pdf'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
+ <a href='https://github.com/openai/grade-school-math'><img src='https://img.shields.io/badge/Github-Repo-grey?logo=github&logoColor=white'></a>
+ TheoremQA: A Theorem-driven Question Answering Dataset
+ <a href='https://arxiv.org/pdf/2305.12524.pdf'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
+ <a href='https://github.com/TIGER-AI-Lab/TheoremQA'><img src='https://img.shields.io/badge/Github-Repo-grey?logo=github&logoColor=white'></a>
+ GPQA: A Graduate-Level Google-Proof Q&A Benchmark
+ <a href='https://arxiv.org/pdf/2311.12022.pdf'><img src='https://img.shields.io/badge/cs.CV-Paper-b31b1b?logo=arxiv&logoColor=red'></a>
+ <a href='https://github.com/idavidrein/gpqa'><img src='https://img.shields.io/badge/Github-Repo-grey?logo=github&logoColor=white'></a>
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
+ CITATION_BUTTON_TEXT = r"""@article{hendrycks2021measuring,
+ title={Measuring Mathematical Problem Solving With the MATH Dataset},
+ author={Hendrycks, Dan and Burns, Collin and Kadavath, Saurav and Arora, Akul and Basart, Steven and Tang, Eric and Song, Dawn and Steinhardt, Jacob},
+ journal={arXiv preprint arXiv:2103.03874},
+ year={2021}
+ }
+ @article{cobbe2021training,
+ title={Training verifiers to solve math word problems},
+ author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
+ journal={arXiv preprint arXiv:2110.14168},
+ year={2021}
+ }
+ @inproceedings{chen2023theoremqa,
 title={Theoremqa: A theorem-driven question answering dataset},
 author={Chen, Wenhu and Yin, Ming and Ku, Max and Lu, Pan and Wan, Yixin and Ma, Xueguang and Xu, Jianyu and Wang, Xinyi and Xia, Tony},
 booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
 year={2023}
+ }
+ @article{rein2023gpqa,
+ title={Gpqa: A graduate-level google-proof q\&a benchmark},
+ author={Rein, David and Hou, Betty Li and Stickland, Asa Cooper and Petty, Jackson and Pang, Richard Yuanzhe and Dirani, Julien and Michael, Julian and Bowman, Samuel R},
+ journal={arXiv preprint arXiv:2311.12022},
+ year={2023}
 }"""
 
+ SUBMIT_INTRODUCTION = """# Submit to the Science Leaderboard
 
 ## ⚠ Please note that you need to submit a JSON file in the following format:
 
@@ -64,6 +98,7 @@
 "TheoremQA": 0.5,
 "MATH": 0.5,
- "GSM": 0.5
+ "GSM": 0.5,
+ "GPQA": 0.5
 }
 ```
 After submitting, you can click the "Refresh" button to see the updated leaderboard (it may take a few seconds).
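The submission format above can be generated with a short script. Below is a minimal sketch, assuming the leaderboard expects a flat JSON object mapping the four dataset names to accuracies in [0, 1]; the scores and the `submission.json` filename are placeholders, not real results or a required name:

```python
import json

# Accuracy per dataset, using the keys shown in SUBMIT_INTRODUCTION.
# The 0.5 values are placeholders, not real evaluation results.
results = {
    "TheoremQA": 0.5,
    "MATH": 0.5,
    "GSM": 0.5,
    "GPQA": 0.5,
}

# Basic sanity checks before writing the file.
expected_keys = {"TheoremQA", "MATH", "GSM", "GPQA"}
assert set(results) == expected_keys, "unexpected or missing dataset keys"
assert all(0.0 <= v <= 1.0 for v in results.values()), "scores must be in [0, 1]"

# Write the submission file (the filename is illustrative).
with open("submission.json", "w") as f:
    json.dump(results, f, indent=2)
```

Serializing with `json.dump` rather than hand-writing the string avoids the kind of missing-comma error that makes a submission unparseable.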