FT: precision and adapter models
Clémentine committed · commit 12cea14 · parent 99b25b8
app.py CHANGED

@@ -28,7 +28,6 @@ PRIVATE_QUEUE_REPO = "open-llm-leaderboard/private-requests"
 PRIVATE_RESULTS_REPO = "open-llm-leaderboard/private-results"
 
 IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
-ADD_PLOTS = False
 
 EVAL_REQUESTS_PATH = "eval-queue"
 EVAL_RESULTS_PATH = "eval-results"

@@ -56,8 +55,8 @@ COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default an
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
 if not IS_PUBLIC:
-    COLS.insert(2, AutoEvalColumn.
-    TYPES.insert(2, AutoEvalColumn.
+    COLS.insert(2, AutoEvalColumn.precision.name)
+    TYPES.insert(2, AutoEvalColumn.precision.type)
 
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]

@@ -177,25 +176,27 @@ def add_new_eval(
     model: str,
     base_model: str,
     revision: str,
-
+    precision: str,
     private: bool,
-
+    weight_type: str,
 ):
+    precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
 
     # check the model actually exists before adding the eval
     if revision == "":
         revision = "main"
 
-    if 
+    if weight_type in ["Delta", "Adapter"]:
         base_model_on_hub, error = is_model_on_hub(base_model, revision)
         if not base_model_on_hub:
             return styled_error(f'Base model "{base_model}" {error}')
+
 
     model_on_hub, error = is_model_on_hub(model, revision)
     if not model_on_hub:
         return styled_error(f'Model "{model}" {error}')
-
+
     print("adding new eval")
 
     eval_entry = {

@@ -203,8 +204,8 @@ def add_new_eval(
         "base_model": base_model,
         "revision": revision,
         "private": private,
-        "
-        "
+        "precision": precision,
+        "weight_type": weight_type,
         "status": "PENDING",
         "submitted_time": current_time,
     }

@@ -217,7 +218,7 @@ def add_new_eval(
 
     OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
-    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{
+    out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
 
     # Check for duplicate submission
     if out_path.split("eval-queue/")[1].lower() in requested_models:

@@ -381,17 +382,29 @@ with demo:
                 revision_name_textbox = gr.Textbox(
                     label="revision", placeholder="main"
                 )
+                private = gr.Checkbox(
+                    False, label="Private", visible=not IS_PUBLIC
+                )
 
             with gr.Column():
-
-
+                precision = gr.Dropdown(
+                    choices=["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)"],
+                    label="Precision",
+                    multiselect=False,
+                    value="float16",
+                    max_choices=1,
+                    interactive=True,
                 )
-
-
+                weight_type = gr.Dropdown(
+                    choices=["Original", "Delta", "Adapter"],
+                    label="Weights type",
+                    multiselect=False,
+                    value="Original",
+                    max_choices=1,
+                    interactive=True,
                 )
-                is_delta_weight = gr.Checkbox(False, label="Delta weights")
                 base_model_name_textbox = gr.Textbox(
-                    label="
+                    label="Base model (for delta or adapter weights)"
                 )
 
         submit_button = gr.Button("Submit Eval")

@@ -402,9 +415,9 @@ with demo:
                 model_name_textbox,
                 base_model_name_textbox,
                 revision_name_textbox,
-
+                precision,
                 private,
-
+                weight_type,
             ],
             submission_result,
         )
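The diff above ties the two new form fields into the request-file name. As a minimal sketch (not part of the commit; user and model values below are hypothetical), this is what the precision normalization and the resulting path look like:

# Hypothetical stand-ins for the Gradio inputs and the sanitized model id.
precision_label = "8bit (LLM.int8)"        # dropdown value as displayed
precision = precision_label.split(" ")[0]  # -> "8bit", same normalization as add_new_eval
weight_type = "Adapter"
private = False
model_path = "someorg_some-model-7b"

out_path = f"eval-queue/someuser/{model_path}_eval_request_{private}_{precision}_{weight_type}.json"
print(out_path)
# eval-queue/someuser/someorg_some-model-7b_eval_request_False_8bit_Adapter.json

Encoding precision and weight type into the filename is also what lets the duplicate-submission check distinguish, say, a float16 run from an 8bit run of the same model.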
    	
src/assets/hardcoded_evals.py CHANGED

@@ -3,7 +3,7 @@ from src.utils_display import AutoEvalColumn, model_hyperlink
 gpt4_values = {
     AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
     AutoEvalColumn.revision.name: "tech report",
-    AutoEvalColumn.
+    AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 84.3,
     AutoEvalColumn.arc.name: 96.3,
     AutoEvalColumn.hellaswag.name:  95.3,

@@ -15,7 +15,7 @@ gpt4_values = {
 gpt35_values = {
     AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
     AutoEvalColumn.revision.name: "tech report",
-    AutoEvalColumn.
+    AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 71.9,
     AutoEvalColumn.arc.name: 85.2,
     AutoEvalColumn.hellaswag.name:  85.5,

@@ -27,7 +27,7 @@ gpt35_values = {
 baseline = {
     AutoEvalColumn.model.name: "<p>Baseline</p>",
     AutoEvalColumn.revision.name: "N/A",
-    AutoEvalColumn.
+    AutoEvalColumn.precision.name: None,
     AutoEvalColumn.average.name: 25.0,
     AutoEvalColumn.arc.name: 25.0,
     AutoEvalColumn.hellaswag.name:  25.0,
    	
src/assets/text_content.py CHANGED

@@ -122,12 +122,16 @@ The tasks and few shots parameters are:
 - TruthfulQA: 0-shot, *truthfulqa-mc* (`mc2`)
 - MMLU: 5-shot, *hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions* (`acc` of `all`)
 
+### Quantization
+To get more information about quantization, see:
+- 8 bits: [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration), [paper](https://arxiv.org/abs/2208.07339)
+- 4 bits: [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes), [paper](https://arxiv.org/abs/2305.14314)
+
 # In case of model failure
 If your model is displayed in the `FAILED` category, its execution stopped.
 Make sure you have followed the above steps first.
 If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
 
-
 """
 
 EVALUATION_QUEUE_TEXT = f"""
    	
src/auto_leaderboard/get_model_metadata.py CHANGED

@@ -36,7 +36,7 @@ def get_model_license(model_info):
 def get_model_likes(model_info):
     return model_info.likes
 
-size_pattern = re.compile(r"\d+(b|m)")
+size_pattern = re.compile(r"(\d\.)?\d+(b|m)")
 
 def get_model_size(model_name, model_info):
     # In billions

@@ -46,7 +46,7 @@ def get_model_size(model_name, model_info):
     try:
         size_match = re.search(size_pattern, model_name.lower())
         size = size_match.group(0)
-        return round(
+        return round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3)
     except AttributeError:
         return None
 
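The widened pattern is the point of the first hunk: the old r"\d+(b|m)" could not match fractional sizes such as "1.4b". A quick sketch of the new behaviour (model names here are only examples):

import re

size_pattern = re.compile(r"(\d\.)?\d+(b|m)")  # new pattern from this commit

for name in ["llama-7b", "pythia-1.4b", "opt-350m"]:
    size = re.search(size_pattern, name.lower()).group(0)
    # Same conversion as get_model_size: billions stay as-is, millions are divided by 1e3.
    print(name, round(float(size[:-1]) if size[-1] == "b" else float(size[:-1]) / 1e3, 3))
# llama-7b 7.0
# pythia-1.4b 1.4
# opt-350m 0.35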
    	
src/auto_leaderboard/load_results.py CHANGED

@@ -24,7 +24,7 @@ class EvalResult:
     model: str
     revision: str
     results: dict
-
+    precision: str = "16bit"
 
     def to_dict(self):
         if self.org is not None:

@@ -34,7 +34,7 @@ class EvalResult:
         data_dict = {}
 
         data_dict["eval_name"] = self.eval_name # not a column, just a save name
-        data_dict[AutoEvalColumn.
+        data_dict[AutoEvalColumn.precision.name] = self.precision
         data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
         data_dict[AutoEvalColumn.dummy.name] = base_model
         data_dict[AutoEvalColumn.revision.name] = self.revision
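Because the new precision field carries a default, result files created before this commit (which record no precision) still parse and simply display as "16bit". A trimmed sketch of that behaviour (only the fields visible in the diff; the other EvalResult fields are omitted):

from dataclasses import dataclass

@dataclass
class EvalResult:
    eval_name: str
    model: str
    revision: str
    results: dict
    precision: str = "16bit"  # fallback for results that predate the precision field

legacy = EvalResult("org_model", "model", "main", {"arc": 50.0})
print(legacy.precision)  # 16bit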
    	
src/auto_leaderboard/model_metadata_type.py CHANGED

@@ -161,3 +161,12 @@ TYPE_METADATA: Dict[str, ModelType] = {
 def get_model_type(leaderboard_data: List[dict]):
     for model_data in leaderboard_data:
         model_data["Type"] = TYPE_METADATA.get(model_data["model_name_for_query"], "N/A")
+        if model_data["Type"] == "N/A":
+            if any([i in model_data["model_name_for_query"] for i in ["finetuned", "-ft-"]]):
+                model_data["Type"] = ModelType.SFT
+            elif any([i in model_data["model_name_for_query"] for i in ["pretrained"]]):
+                model_data["Type"] = ModelType.PT
+            elif any([i in model_data["model_name_for_query"] for i in ["-rl-", "-rlhf-"]]):
+                model_data["Type"] = ModelType.RL
+
+
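The added branch gives models absent from TYPE_METADATA a best-effort type based on their name. A standalone sketch of the same substring logic (hypothetical model names, and plain strings standing in for the ModelType members):

def guess_type(name: str) -> str:
    # Mirrors the substring checks added to get_model_type above.
    if any(i in name for i in ["finetuned", "-ft-"]):
        return "SFT"
    if "pretrained" in name:
        return "PT"
    if any(i in name for i in ["-rl-", "-rlhf-"]):
        return "RL"
    return "N/A"

for name in ["org/llama-7b-finetuned", "org/gpt-pretrained-3b", "org/chat-rlhf-13b", "org/mystery"]:
    print(name, guess_type(name))
# org/llama-7b-finetuned SFT
# org/gpt-pretrained-3b PT
# org/chat-rlhf-13b RL
# org/mystery N/A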
    	
src/utils_display.py CHANGED

@@ -20,8 +20,8 @@ class AutoEvalColumn: # Auto evals column
     hellaswag = ColumnContent("HellaSwag ⬆️", "number", True)
     mmlu = ColumnContent("MMLU ⬆️", "number", True)
     truthfulqa = ColumnContent("TruthfulQA (MC) ⬆️", "number", True)
-    model_type = ColumnContent("Type", "
-
+    model_type = ColumnContent("Type", "str", False)
+    precision = ColumnContent("Precision", "str", False, True)
     license = ColumnContent("Hub License", "str", False)
     params = ColumnContent("#Params (B)", "number", False)
     likes = ColumnContent("Hub ❤️", "number", False)

@@ -42,8 +42,8 @@ class EvalQueueColumn: # Queue column
     model = ColumnContent("model", "markdown", True)
     revision = ColumnContent("revision", "str", True)
     private = ColumnContent("private", "bool", True)
-
-
+    precision = ColumnContent("precision", "bool", True)
+    weight_type = ColumnContent("weight_type", "str", "Original")
     status = ColumnContent("status", "str", True)
 
 LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
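For context on the four-argument call ColumnContent("Precision", "str", False, True): the list comprehensions in app.py read c.name, c.type, c.displayed_by_default and c.hidden, so the class plausibly has the shape sketched below. This is inferred, not confirmed, since the ColumnContent definition itself is not part of this diff.

from dataclasses import dataclass

@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False

# The new hidden precision column added by this commit:
precision = ColumnContent("Precision", "str", False, True)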