Clémentine committed · Commit ed1fdef · Parent: 6fefae4

added 'forbidden models' submission, to allow orgs to request their models to not be submitted in case of contamination
app.py CHANGED

@@ -10,7 +10,7 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import HfApi
 from transformers import AutoConfig
 
-from src.auto_leaderboard.get_model_metadata import apply_metadata
+from src.auto_leaderboard.get_model_metadata import apply_metadata, DO_NOT_SUBMIT_MODELS
 from src.assets.text_content import *
 from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
 from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
@@ -227,9 +227,13 @@ def add_new_eval(
     os.makedirs(OUT_DIR, exist_ok=True)
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
 
+    # Check if the model has been forbidden:
+    if out_path.split("eval-queue/")[1] in DO_NOT_SUBMIT_MODELS:
+        return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
+
     # Check for duplicate submission
     if out_path.split("eval-queue/")[1].lower() in requested_models:
-        return styled_warning("This model has been already submitted.")
+        return styled_warning("This model has been already submitted.")
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
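The new guard runs before the existing duplicate-submission check: the portion of the eval-request path after "eval-queue/" is matched against DO_NOT_SUBMIT_MODELS, and the submission is rejected with a styled warning before any request file is written. Below is a minimal, self-contained sketch of that flow; guard_submission is a hypothetical helper and styled_warning is stubbed, while DO_NOT_SUBMIT_MODELS and the two membership checks mirror the commit.

# Minimal sketch of the pre-submission guard added in this commit.
# guard_submission and the styled_warning stub are hypothetical; only
# DO_NOT_SUBMIT_MODELS and the two membership checks mirror app.py.
DO_NOT_SUBMIT_MODELS = ["Voicelab/trurl-2-13b"]

def styled_warning(msg: str) -> str:
    # Stand-in for the app's HTML-formatted warning helper.
    return f"WARNING: {msg}"

def guard_submission(out_path: str, requested_models: set):
    """Return a warning if the submission must be rejected, otherwise None."""
    queue_entry = out_path.split("eval-queue/")[1]
    if queue_entry in DO_NOT_SUBMIT_MODELS:
        return styled_warning("Model authors have requested that their model be not submitted on the leaderboard.")
    if queue_entry.lower() in requested_models:
        return styled_warning("This model has been already submitted.")
    return None  # safe to write the eval request file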
    	
src/auto_leaderboard/get_model_metadata.py CHANGED

@@ -8,7 +8,7 @@ from tqdm import tqdm
 
 from src.utils_display import AutoEvalColumn, model_hyperlink
 from src.auto_leaderboard.model_metadata_type import ModelType, model_type_from_str, MODEL_TYPE_METADATA
-from src.auto_leaderboard.model_metadata_flags import FLAGGED_MODELS
+from src.auto_leaderboard.model_metadata_flags import FLAGGED_MODELS, DO_NOT_SUBMIT_MODELS
 
 from huggingface_hub import HfApi
 import huggingface_hub
@@ -106,7 +106,18 @@ def flag_models(leaderboard_data:List[dict]):
             issue_link = model_hyperlink(FLAGGED_MODELS[model_data["model_name_for_query"]], f"See discussion #{issue_num}")
             model_data[AutoEvalColumn.model.name] =  f"{model_data[AutoEvalColumn.model.name]} has been flagged! {issue_link}"
 
+def remove_forbidden_models(leaderboard_data: List[dict]):
+    indices_to_remove = []
+    for ix, model in enumerate(leaderboard_data):
+        if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS:
+            indices_to_remove.append(ix)
+
+    for ix in reversed(indices_to_remove):
+        leaderboard_data.pop(ix)
+    return leaderboard_data
+
 def apply_metadata(leaderboard_data: List[dict]):
+    leaderboard_data = remove_forbidden_models(leaderboard_data)
     get_model_type(leaderboard_data)
     get_model_infos_from_hub(leaderboard_data)
     flag_models(leaderboard_data)
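remove_forbidden_models filters forbidden entries out of the in-memory leaderboard before the other metadata passes run. It records the matching indices first and then pops them in reverse order, so removing one entry does not shift the indices still left to remove. A small standalone sketch of the same pattern, using made-up leaderboard rows for illustration:

# Same collect-indices-then-pop-in-reverse pattern as remove_forbidden_models,
# demonstrated on made-up leaderboard rows.
DO_NOT_SUBMIT_MODELS = ["Voicelab/trurl-2-13b"]

leaderboard_data = [
    {"model_name_for_query": "org-a/model-1"},
    {"model_name_for_query": "Voicelab/trurl-2-13b"},
    {"model_name_for_query": "org-b/model-2"},
]

indices_to_remove = [
    ix for ix, model in enumerate(leaderboard_data)
    if model["model_name_for_query"] in DO_NOT_SUBMIT_MODELS
]
for ix in reversed(indices_to_remove):  # reverse order keeps the remaining indices valid
    leaderboard_data.pop(ix)

print([m["model_name_for_query"] for m in leaderboard_data])
# -> ['org-a/model-1', 'org-b/model-2']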
    	
src/auto_leaderboard/model_metadata_flags.py CHANGED

@@ -1,6 +1,12 @@
-# 
+# Models which have been flagged by users as being problematic for a reason or another
+# (Model name to forum discussion link)
 FLAGGED_MODELS = {
     "Voicelab/trurl-2-13b": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/202",
     "deepnight-research/llama-2-70B-inst": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/207",
     "Aspik101/trurl-2-13b-pl-instruct_unload": "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard/discussions/213",
-}
+}
+
+# Models which have been requested by orgs to not be submitted on the leaderboard
+DO_NOT_SUBMIT_MODELS = [
+    "Voicelab/trurl-2-13b", # trained on MMLU
+]
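DO_NOT_SUBMIT_MODELS is a plain list of Hub model IDs consumed in two places: app.py rejects new submissions for listed models, and get_model_metadata.py drops their rows from the leaderboard data. Handling a new removal request only means appending an ID to the list; the second entry below is a hypothetical illustration, not part of the commit.

# Models which have been requested by orgs to not be submitted on the leaderboard
DO_NOT_SUBMIT_MODELS = [
    "Voicelab/trurl-2-13b",       # trained on MMLU
    "example-org/example-model",  # hypothetical entry showing how a new request would be added
]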
