	Update utils.py
utils.py CHANGED
@@ -8,42 +8,103 @@ from huggingface_hub import Repository
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
 
-SUBJECTS = ["Biology", "Business", "Chemistry", "Computer Science", "Economics", "Engineering",
-            "Health", "History", "Law", "Math", "Philosophy", "Physics", "Psychology", "Other"]
-
 MODEL_INFO = [
-    "…
-    "…
-    "…
-    "…
-…
-…
-…
-…
-…
-…
+    "Model (CoT)",
+    "Avg",
+    "TheoremQA",
+    "MATH",
+    "GSM",
+    "GPQA",
+    "MMLU-STEM"
+    ]
+
+DATA_TITILE_TYPE = ['markdown', 'number', 'number', 'number', 'number', 'number', 'number']
+
+SUBMISSION_NAME = "science_leaderboard_submission"
 SUBMISSION_URL = os.path.join("https://huggingface.co/datasets/TIGER-Lab/", SUBMISSION_NAME)
-CSV_DIR = "./…
+CSV_DIR = "./science_leaderboard_submission/results.csv"
 
 COLUMN_NAMES = MODEL_INFO
 
-…
-…
-…
-…
-…
+LEADERBORAD_INTRODUCTION = """# Science Leaderboard
+
+    **"Which large language model is the BEST on science and engineering?"**<br>
+    🏆 Welcome to the **Science** leaderboard! It covers the most popular evaluations for different science subjects, including math, physics, biology, chemistry, computer science, and finance.
+    <div style="display: flex; flex-wrap: wrap; align-items: center; gap: 10px;">
+    </div>
+    The evaluation sets from the following datasets are included in the leaderboard.
+    <ul>
+        <li> MATH (4-shot): the 5000-question test set from American math contests, covering fields such as algebra, calculus, statistics, geometry, linear algebra, and number theory.
+        <li> GSM8K (4-shot): the 1320-question test set of grade-school math word problems, mainly covering algebra.
+        <li> TheoremQA (5-shot): the 800-question test set collected from college-level exams, covering math, physics, engineering, and finance.
+        <li> GPQA (5-shot): the 198-question test set from the college-level GPQA-Diamond dataset, covering fields such as chemistry, genetics, and biology.
+        <li> MMLU-STEM (5-shot): the 3.3K-question test set from the MMLU dataset, covering fields such as math, chemistry, genetics, biology, computer science, anatomy, and astronomy.
+    </ul>
+
+    **"How to evaluate your model and submit your results?"**<br>
+    Please refer to the guidelines on <a href="https://github.com/TIGER-AI-Lab/MAmmoTH/blob/main/math_eval/README.md">GitHub</a> to evaluate your own model.
+
+    <a href='https://hits.seeyoufarm.com'><img src='https://hits.seeyoufarm.com/api/count/incr/badge.svg?url=https%3A%2F%2Fhuggingface.co%2Fspaces%2FTIGER-Lab%2FTheoremQA-Leaderboard&count_bg=%23C7C83D&title_bg=%23555555&icon=&icon_color=%23E7E7E7&title=hits&edge_flat=false'></a>
     """
 
 TABLE_INTRODUCTION = """
     """
 
-…
+LEADERBORAD_INFO = """
 We list the information of the used datasets as follows:<br>
 
+MATH: Measuring Mathematical Problem Solving With the MATH Dataset<br>
+<a href='https://arxiv.org/pdf/2103.03874.pdf'>Paper</a><br>
+<a href='https://github.com/hendrycks/math'>Code</a><br>
+
+GSM8K: Training Verifiers to Solve Math Word Problems<br>
+<a href='https://arxiv.org/pdf/2110.14168.pdf'>Paper</a><br>
+<a href='https://github.com/openai/grade-school-math'>Code</a><br>
+
+TheoremQA: A Theorem-Driven Question Answering Dataset<br>
+<a href='https://arxiv.org/pdf/2305.12524.pdf'>Paper</a><br>
+<a href='https://github.com/TIGER-AI-Lab/TheoremQA'>Code</a><br>
+
+GPQA: A Graduate-Level Google-Proof Q&A Benchmark<br>
+<a href='https://arxiv.org/pdf/2311.12022.pdf'>Paper</a><br>
+<a href='https://github.com/idavidrein/gpqa'>Code</a>
+
+MMLU: Measuring Massive Multitask Language Understanding<br>
+<a href='https://arxiv.org/pdf/2009.03300.pdf'>Paper</a><br>
+<a href='https://github.com/hendrycks/test'>Code</a>
 """
 
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
-CITATION_BUTTON_TEXT = r"""…
+CITATION_BUTTON_TEXT = r"""@inproceedings{hendrycks2021measuring,
+  title={Measuring Mathematical Problem Solving With the MATH Dataset},
+  author={Hendrycks, Dan and Burns, Collin and Kadavath, Saurav and Arora, Akul and Basart, Steven and Tang, Eric and Song, Dawn and Steinhardt, Jacob},
+  booktitle={Thirty-fifth Conference on Neural Information Processing Systems Datasets and Benchmarks Track (Round 2)},
+  year={2021}
+}
+@article{cobbe2021training,
+  title={Training verifiers to solve math word problems},
+  author={Cobbe, Karl and Kosaraju, Vineet and Bavarian, Mohammad and Chen, Mark and Jun, Heewoo and Kaiser, Lukasz and Plappert, Matthias and Tworek, Jerry and Hilton, Jacob and Nakano, Reiichiro and others},
+  journal={arXiv preprint arXiv:2110.14168},
+  year={2021}
+}
+@inproceedings{chen2023theoremqa,
+  title={Theoremqa: A theorem-driven question answering dataset},
+  author={Chen, Wenhu and Yin, Ming and Ku, Max and Lu, Pan and Wan, Yixin and Ma, Xueguang and Xu, Jianyu and Wang, Xinyi and Xia, Tony},
+  booktitle={The 2023 Conference on Empirical Methods in Natural Language Processing},
+  year={2023}
+}
+@article{rein2023gpqa,
+  title={Gpqa: A graduate-level google-proof q\&a benchmark},
+  author={Rein, David and Hou, Betty Li and Stickland, Asa Cooper and Petty, Jackson and Pang, Richard Yuanzhe and Dirani, Julien and Michael, Julian and Bowman, Samuel R},
+  journal={arXiv preprint arXiv:2311.12022},
+  year={2023}
+}
+@inproceedings{hendrycks2020measuring,
+  title={Measuring Massive Multitask Language Understanding},
+  author={Hendrycks, Dan and Burns, Collin and Basart, Steven and Zou, Andy and Mazeika, Mantas and Song, Dawn and Steinhardt, Jacob},
+  booktitle={International Conference on Learning Representations},
+  year={2020}
+}"""
 
 SUBMIT_INTRODUCTION = """# Submit on Science Leaderboard Introduction
 
@@ -51,40 +112,36 @@ SUBMIT_INTRODUCTION = """# Submit on Science Leaderboard Introduction
 
 ```json
 {
-    "Model": "[…
-    "…
-    "…
-    "…
-…
-    "…
+    "Model": "[NAME]",
+    "Repo": "https://huggingface.co/[MODEL_NAME]",
+    "TheoremQA": 50,
+    "MATH": 50,
+    "GSM": 50,
+    "GPQA": 50,
+    "MMLU-STEM": 50
}
 ```
-After submitting, you can click the "Refresh" button to see the updated leaderboard…
+After submitting, you can click the "Refresh" button to see the updated leaderboard (it may take a few seconds).
 
 """
-
-
 def get_df():
     repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN)
     repo.git_pull()
     df = pd.read_csv(CSV_DIR)
-    df = df…
+    df['Avg'] = df[['TheoremQA', 'MATH', 'GSM', 'GPQA', 'MMLU-STEM']].mean(axis=1).round(1)
+    df = df.sort_values(by=['Avg'], ascending=False)
     return df[COLUMN_NAMES]
 
-
 def add_new_eval(
     input_file,
 ):
     if input_file is None:
         return "Error! Empty file!"
 
-    upload_data…
-    data_row = [f'{upload_data["Model"]}', upload_data['…
-    for subject in SUBJECTS:
-        data_row += [upload_data[subject]]
+    upload_data = json.loads(input_file)
+    data_row = [f'[{upload_data["Model"]}]({upload_data["Repo"]})', upload_data['TheoremQA'], upload_data['MATH'], upload_data['GSM'], upload_data['GPQA'], upload_data['MMLU-STEM']]
 
-    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL,
-                                 use_auth_token=HF_TOKEN, repo_type="dataset")
+    submission_repo = Repository(local_dir=SUBMISSION_NAME, clone_from=SUBMISSION_URL, use_auth_token=HF_TOKEN, repo_type="dataset")
     submission_repo.git_pull()
 
     already_submitted = []
@@ -105,7 +162,4 @@ def add_new_eval(
 
 
 def refresh_data():
-    return get_df()
-
-
-
+    return get_df()
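For reference, the changed pieces can be exercised offline. The sketch below is a minimal, hypothetical walk-through (it makes no Hugging Face repository calls, and the model name, repo URL, and scores are placeholders): it builds a submission in the new JSON format, mirrors the `data_row` construction added to `add_new_eval`, and applies the `Avg`/sorting logic that `get_df` now performs.

```python
# Minimal offline sketch of the new submission flow. Assumption: no real
# dataset-repo access; the model name, repo URL, and scores are placeholders.
import json
import pandas as pd

# A submission file in the new format documented in SUBMIT_INTRODUCTION.
submission = json.dumps({
    "Model": "my-model",
    "Repo": "https://huggingface.co/org/my-model",
    "TheoremQA": 51.0, "MATH": 42.5, "GSM": 76.0, "GPQA": 33.3, "MMLU-STEM": 62.7,
})

# Mirrors add_new_eval: the model cell becomes a markdown link, followed by
# the five per-benchmark scores.
upload_data = json.loads(submission)
data_row = [f'[{upload_data["Model"]}]({upload_data["Repo"]})',
            upload_data['TheoremQA'], upload_data['MATH'], upload_data['GSM'],
            upload_data['GPQA'], upload_data['MMLU-STEM']]

# Mirrors get_df: 'Avg' is the rounded row mean of the five benchmarks, and
# the table is sorted by it in descending order.
df = pd.DataFrame([data_row], columns=['Model (CoT)', 'TheoremQA', 'MATH',
                                       'GSM', 'GPQA', 'MMLU-STEM'])
df['Avg'] = df[['TheoremQA', 'MATH', 'GSM', 'GPQA', 'MMLU-STEM']].mean(axis=1).round(1)
df = df.sort_values(by=['Avg'], ascending=False)
print(df[['Model (CoT)', 'Avg', 'TheoremQA', 'MATH', 'GSM', 'GPQA', 'MMLU-STEM']])
```

Note that `Avg` is derived at display time in `get_df` rather than stored in results.csv, so the final column order comes from `COLUMN_NAMES` (i.e. `MODEL_INFO`), which places it right after the model cell.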