Clémentine committed · Commit 460d762 · Parent: a7cba30

merge refactor
Files changed:

- .gitignore +2 -1
- app.py +107 -274
- src/assets/css_html_js.py +87 -0
- src/assets/hardcoded_evals.py +38 -0
- scale-hf-logo.png → src/assets/scale-hf-logo.png +0 -0
- content.py → src/assets/text_content.py +5 -1
- src/auto_leaderboard/get_model_metadata.py +54 -0
- utils.py → src/auto_leaderboard/load_results.py +23 -57
- elo_utils.py → src/elo_leaderboard/load_results.py +8 -31
- visualizations.py → src/elo_leaderboard/visualizations.py +1 -1
- src/init.py +73 -0
- src/utils_display.py +96 -0
.gitignore CHANGED

@@ -1,9 +1,10 @@
-…
+auto_evals/
 venv/
 __pycache__/
 .env
 .ipynb_checkpoints
 *ipynb
+.vscode/
 
 gpt_4_evals/
 human_evals/

app.py CHANGED

@@ -7,19 +7,25 @@ import gradio as gr
 import numpy as np
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
-from huggingface_hub import HfApi…
+from huggingface_hub import HfApi
 from transformers import AutoConfig
 
-from …
-from …
-from …
+from src.auto_leaderboard.get_model_metadata import apply_metadata
+from src.assets.text_content import *
+from src.elo_leaderboard.load_results import get_elo_plots, get_elo_results_dicts
+from src.auto_leaderboard.load_results import get_eval_results_dicts, make_clickable_model
+from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline
+from src.assets.css_html_js import custom_css, get_window_url_params
+from src.utils_display import AutoEvalColumn, EvalQueueColumn, EloEvalColumn, fields, styled_error, styled_warning, styled_message
+from src.init import load_all_info_from_hub
 
 # clone / pull the lmeh eval data
 H4_TOKEN = os.environ.get("H4_TOKEN", None)
 LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
 HUMAN_EVAL_REPO = "HuggingFaceH4/scale-human-eval"
 GPT_4_EVAL_REPO = "HuggingFaceH4/open_llm_leaderboard_oai_evals"
-IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", …
+IS_PUBLIC = bool(os.environ.get("IS_PUBLIC", True))
+ADD_PLOTS = False
 
 api = HfApi()
 
@@ -29,113 +35,25 @@ def restart_space():
         repo_id="HuggingFaceH4/open_llm_leaderboard", token=H4_TOKEN
     )
 
-…
 
-…
-            file_names.extend([os.path.join(root, file) for file in files])
-…
-    return set([file_name.lower().split("./evals/")[1] for file_name in file_names])
-…
-repo = None
-requested_models = None
-if H4_TOKEN:
-    print("Pulling evaluation requests and results.")
-    # try:
-    #     shutil.rmtree("./evals/")
-    # except:
-    #     pass
-…
-    repo = Repository(
-        local_dir="./evals/",
-        clone_from=LMEH_REPO,
-        use_auth_token=H4_TOKEN,
-        repo_type="dataset",
-    )
-    repo.git_pull()
-…
-    requested_models_dir = "./evals/eval_requests"
-    requested_models = get_all_requested_models(requested_models_dir)
-…
-human_eval_repo = None
-if H4_TOKEN and not os.path.isdir("./human_evals"):
-    print("Pulling human evaluation repo")
-    human_eval_repo = Repository(
-        local_dir="./human_evals/",
-        clone_from=HUMAN_EVAL_REPO,
-        use_auth_token=H4_TOKEN,
-        repo_type="dataset",
-    )
-    human_eval_repo.git_pull()
-…
-gpt_4_eval_repo = None
-if H4_TOKEN and not os.path.isdir("./gpt_4_evals"):
-    print("Pulling GPT-4 evaluation repo")
-    gpt_4_eval_repo = Repository(
-        local_dir="./gpt_4_evals/",
-        clone_from=GPT_4_EVAL_REPO,
-        use_auth_token=H4_TOKEN,
-        repo_type="dataset",
-    )
-    gpt_4_eval_repo.git_pull()
-…
-# parse the results
-BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
-METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
-…
-def load_results(model, benchmark, metric):
-    file_path = os.path.join("evals", model, f"{model}-eval_{benchmark}.json")
-    if not os.path.exists(file_path):
-        return 0.0, None
 
-…
-    accs = np.array([v[metric] for k, v in data["results"].items()])
-    mean_acc = np.mean(accs)
-    return mean_acc, data["config"]["model_args"]
 
-…
-    "Average ⬆️",
-    "ARC (25-shot) ⬆️",
-    "HellaSwag (10-shot) ⬆️",
-    "MMLU (5-shot) ⬆️",
-    "TruthfulQA (0-shot) ⬆️",
-    "model_name_for_query",  # dummy column to implement search bar (hidden by custom CSS)
-]
-TYPES = ["markdown", "str", "number", "number", "number", "number", "number", "str"]
-…
-if not IS_PUBLIC:
-    COLS.insert(2, "8bit")
-    TYPES.insert(2, "bool")
-…
-EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
-EVAL_TYPES = ["markdown", "str", "bool", "bool", "bool", "str"]
-…
-BENCHMARK_COLS = [
-    "ARC (25-shot) ⬆️",
-    "HellaSwag (10-shot) ⬆️",
-    "MMLU (5-shot) ⬆️",
-    "TruthfulQA (0-shot) ⬆️",
-]
-…
-ELO_COLS = [
-    "Model",
-    "GPT-4 (all)",
-    "Human (all)",
-    "Human (instruct)",
-    "Human (code-instruct)",
-]
-ELO_TYPES = ["markdown", "number", "number", "number", "number"]
-ELO_SORT_COL = "GPT-4 (all)"
+auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models = load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO)
+
+COLS = [c.name for c in fields(AutoEvalColumn)]
+TYPES = [c.type for c in fields(AutoEvalColumn)]
+COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]
+TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default]
+
+if not IS_PUBLIC:
+    COLS.insert(2, AutoEvalColumn.is_8bit.name)
+    TYPES.insert(2, AutoEvalColumn.is_8bit.type)
+
+EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
+EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
+
+BENCHMARK_COLS = [c.name for c in [AutoEvalColumn.arc, AutoEvalColumn.hellaswag, AutoEvalColumn.mmlu, AutoEvalColumn.truthfulqa]]
+
+ELO_COLS = [c.name for c in fields(EloEvalColumn)]
+ELO_TYPES = [c.type for c in fields(EloEvalColumn)]
+ELO_SORT_COL = EloEvalColumn.gpt4.name
 
 
 def has_no_nan_values(df, columns):
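All of the repo-cloning boilerplate removed above now lives in the new src/init.py (+73 lines), whose body this diff does not show. A minimal sketch of what load_all_info_from_hub plausibly looks like, reconstructed from the removed code; the ./auto_evals/ clone path follows the new .gitignore entry and the eval_requests paths used further down, and anything not visible in the diff is an assumption:

# Sketch only: src/init.py is +73 lines in this commit but its body is not
# shown in the diff. Reconstructed from the Repository blocks removed above.
import os
from huggingface_hub import Repository

H4_TOKEN = os.environ.get("H4_TOKEN", None)

def get_all_requested_models(requested_models_dir: str) -> set:
    # Carried over from the removed helper: collect every eval request file
    # so duplicate submissions can be detected later.
    file_names = []
    for root, _, files in os.walk(requested_models_dir):
        file_names.extend([os.path.join(root, file) for file in files])
    return set([file_name.lower().split("eval_requests/")[1] for file_name in file_names])

def load_all_info_from_hub(lmeh_repo: str, human_eval_repo: str, gpt_4_eval_repo: str):
    # Clone or pull the three dataset repos, mirroring the removed app.py code.
    auto_eval_repo = None
    requested_models = None
    if H4_TOKEN:
        print("Pulling evaluation requests and results.")
        auto_eval_repo = Repository(
            local_dir="./auto_evals/",
            clone_from=lmeh_repo,
            use_auth_token=H4_TOKEN,
            repo_type="dataset",
        )
        auto_eval_repo.git_pull()
        requested_models = get_all_requested_models("./auto_evals/eval_requests")

    human_evals = None
    if H4_TOKEN and not os.path.isdir("./human_evals"):
        print("Pulling human evaluation repo")
        human_evals = Repository(
            local_dir="./human_evals/",
            clone_from=human_eval_repo,
            use_auth_token=H4_TOKEN,
            repo_type="dataset",
        )
        human_evals.git_pull()

    gpt_4_evals = None
    if H4_TOKEN and not os.path.isdir("./gpt_4_evals"):
        print("Pulling GPT-4 evaluation repo")
        gpt_4_evals = Repository(
            local_dir="./gpt_4_evals/",
            clone_from=gpt_4_eval_repo,
            use_auth_token=H4_TOKEN,
            repo_type="dataset",
        )
        gpt_4_evals.git_pull()

    return auto_eval_repo, human_evals, gpt_4_evals, requested_models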
@@ -147,54 +65,21 @@ def has_nan_values(df, columns):
 
 
 def get_leaderboard_df():
-    if …
+    if auto_eval_repo:
         print("Pulling evaluation results for the leaderboard.")
-        …
+        auto_eval_repo.git_pull()
 
     all_data = get_eval_results_dicts(IS_PUBLIC)
 
     if not IS_PUBLIC:
-        gpt4_values = {
-            "Model": f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
-            "Revision": "tech report",
-            "8bit": None,
-            "Average ⬆️": 84.3,
-            "ARC (25-shot) ⬆️": 96.3,
-            "HellaSwag (10-shot) ⬆️": 95.3,
-            "MMLU (5-shot) ⬆️": 86.4,
-            "TruthfulQA (0-shot) ⬆️": 59.0,
-            "model_name_for_query": "GPT-4",
-        }
         all_data.append(gpt4_values)
-        gpt35_values = {
-            "Model": f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
-            "Revision": "tech report",
-            "8bit": None,
-            "Average ⬆️": 71.9,
-            "ARC (25-shot) ⬆️": 85.2,
-            "HellaSwag (10-shot) ⬆️": 85.5,
-            "MMLU (5-shot) ⬆️": 70.0,
-            "TruthfulQA (0-shot) ⬆️": 47.0,
-            "model_name_for_query": "GPT-3.5",
-        }
         all_data.append(gpt35_values)
 
-    …
-        …
-        "Revision": "N/A",
-        "8bit": None,
-        "Average ⬆️": 25.0,
-        "ARC (25-shot) ⬆️": 25.0,
-        "HellaSwag (10-shot) ⬆️": 25.0,
-        "MMLU (5-shot) ⬆️": 25.0,
-        "TruthfulQA (0-shot) ⬆️": 25.0,
-        "model_name_for_query": "baseline",
-    }
-…
-    all_data.append(base_line)
+    all_data.append(baseline)
+    apply_metadata(all_data)  # Populate model type based on known hardcoded values in `metadata.py`
 
     df = pd.DataFrame.from_records(all_data)
-    df = df.sort_values(by=[…
+    df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
     df = df[COLS]
 
     # filter out if any of the benchmarks have not been produced
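apply_metadata is imported from the new src/auto_leaderboard/get_model_metadata.py (+54 lines, also not shown in this diff). Going only by the inline comment above, a hypothetical sketch of its shape; KNOWN_MODEL_TYPES and the model_type key are invented names for illustration:

# Hypothetical sketch; the real get_model_metadata.py is not in this diff.
from src.utils_display import AutoEvalColumn

KNOWN_MODEL_TYPES = {
    # "model-name": "pretrained" / "fine-tuned" / "RL-tuned", hardcoded by hand
}

def apply_metadata(leaderboard_data: list):
    # Mutates each result dict in place, attaching a model type whenever the
    # model name matches a known hardcoded entry.
    for entry in leaderboard_data:
        model_name = entry.get(AutoEvalColumn.dummy.name, "")
        entry["model_type"] = KNOWN_MODEL_TYPES.get(model_name, "")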
@@ -203,20 +88,21 @@ def get_leaderboard_df():
 
 
 def get_evaluation_queue_df():
-    if …
+    # todo @saylortwift: replace the repo by the one you created for the eval queue
+    if auto_eval_repo:
         print("Pulling changes for the evaluation queue.")
-        …
+        auto_eval_repo.git_pull()
 
     entries = [
         entry
-        for entry in os.listdir("…
+        for entry in os.listdir("auto_evals/eval_requests")
         if not entry.startswith(".")
     ]
     all_evals = []
 
     for entry in entries:
         if ".json" in entry:
-            file_path = os.path.join("…
+            file_path = os.path.join("auto_evals/eval_requests", entry)
             with open(file_path) as fp:
                 data = json.load(fp)
 
@@ -229,11 +115,11 @@ def get_evaluation_queue_df():
             # this is a folder
             sub_entries = [
                 e
-                for e in os.listdir(f"…
+                for e in os.listdir(f"auto_evals/eval_requests/{entry}")
                 if not e.startswith(".")
             ]
             for sub_entry in sub_entries:
-                file_path = os.path.join("…
+                file_path = os.path.join("auto_evals/eval_requests", entry, sub_entry)
                 with open(file_path) as fp:
                     data = json.load(fp)
 
@@ -305,13 +191,15 @@ leaderboard_df = original_df.copy()
 
 def is_model_on_hub(model_name, revision) -> bool:
     try:
-        …
-        return True
+        AutoConfig.from_pretrained(model_name, revision=revision)
+        return True, None
+
+    except ValueError as e:
+        return False, "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard."
 
     except Exception as e:
-        print("Could not get the model config from the hub…
-        …
-        return False
+        print("Could not get the model config from the hub.: \n", e)
+        return False, "was not found on hub!"
 
 
 def add_new_eval(
@@ -327,14 +215,15 @@ def add_new_eval(
     # check the model actually exists before adding the eval
     if revision == "":
         revision = "main"
-    if is_delta_weight and not is_model_on_hub(base_model, revision):
-        error_message = f'Base model "{base_model}" was not found on hub!'
-        print(error_message)
-        return f"<p style='color: red; font-size: 20px; text-align: center;'>{error_message}</p>"
 
-    if …
-        …
-        …
+    if is_delta_weight:
+        base_model_on_hub, error = is_model_on_hub(base_model, revision)
+        if not base_model_on_hub:
+            return styled_error(f'Base model "{base_model}" {error}')
+
+    model_on_hub, error = is_model_on_hub(model, revision)
+    if not model_on_hub:
+        return styled_error(f'Model "{model}" {error}')
 
     print("adding new eval")
 
@@ -355,14 +244,13 @@ def add_new_eval(
         user_name = model.split("/")[0]
         model_path = model.split("/")[1]
 
-    OUT_DIR = f"eval_requests/{user_name}"
+    OUT_DIR = f"auto_evals/eval_requests/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
 
     # Check for duplicate submission
-    if out_path.lower() in requested_models:
-        …
-        return f"<p style='color: orange; font-size: 20px; text-align: center;'>{duplicate_request_message}</p>"
+    if out_path.split("eval_requests/")[1].lower() in requested_models:
+        return styled_warning("This model has been already submitted.")
 
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
@@ -375,8 +263,7 @@ def add_new_eval(
         repo_type="dataset",
     )
 
-    …
-    return f"<p style='color: green; font-size: 20px; text-align: center;'>{success_message}</p>"
+    return styled_message("Your request has been submitted to the evaluation queue!")
 
 
 def refresh():
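The styled_error, styled_warning and styled_message helpers imported from src.utils_display replace the inline HTML returns deleted above. Their bodies are not part of this diff, but the removed strings pin down the intent; a likely minimal version:

# Likely shape of the helpers, inferred from the removed <p> strings above
# (red for errors, orange for warnings, green for success).
def styled_error(error: str) -> str:
    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"

def styled_warning(warn: str) -> str:
    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"

def styled_message(message: str) -> str:
    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"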
@@ -395,7 +282,7 @@ def refresh():
 
 
 def search_table(df, query):
-    filtered_df = df[df[…
+    filtered_df = df[df[AutoEvalColumn.dummy.name].str.contains(query, case=False)]
    return filtered_df
 
 
@@ -413,83 +300,6 @@ def change_tab(query_param):
        return gr.Tabs.update(selected=0)
 
 
-custom_css = """
-#changelog-text {
-    font-size: 16px !important;
-}
-
-#changelog-text h2 {
-    font-size: 18px !important;
-}
-
-.markdown-text {
-    font-size: 16px !important;
-}
-
-#models-to-add-text {
-    font-size: 18px !important;
-}
-
-#citation-button span {
-    font-size: 16px !important;
-}
-
-#citation-button textarea {
-    font-size: 16px !important;
-}
-
-#citation-button > label > button {
-    margin: 6px;
-    transform: scale(1.3);
-}
-
-#leaderboard-table {
-    margin-top: 15px
-}
-
-#search-bar-table-box > div:first-child {
-    background: none;
-    border: none;
-}
- 
-#search-bar {
-    padding: 0px;
-    width: 30%;
-}
-
-/* Hides the final column */
-#llm-benchmark-tab-table table td:last-child,
-#llm-benchmark-tab-table table th:last-child {
-    display: none;
-}
-
-/* Limit the width of the first column so that names don't expand too much */
-table td:first-child,
-table th:first-child {
-    max-width: 400px;
-    overflow: auto;
-    white-space: nowrap;
-}
-
-.tab-buttons button {
-    font-size: 20px;
-}
-
-#scale-logo {
-    border-style: none !important;
-    box-shadow: none;
-    display: block;
-    margin-left: auto;
-    margin-right: auto;
-    max-width: 600px;
-}
-
-#scale-logo .download {
-    display: none;
-}
-"""
-
-
 demo = gr.Blocks(css=custom_css)
 with demo:
     gr.HTML(TITLE)
@@ -518,30 +328,52 @@ with demo:
                     show_label=False,
                     elem_id="search-bar",
                 )
-…
+                with gr.Tabs(elem_classes="tab-buttons"):
+                    with gr.TabItem("Light View"):
+                        leaderboard_table_lite = gr.components.Dataframe(
+                            value=leaderboard_df[COLS_LITE],
+                            headers=COLS_LITE,
+                            datatype=TYPES_LITE,
+                            max_rows=None,
+                            elem_id="leaderboard-table-lite",
+                        )
+                    with gr.TabItem("Extended Model View"):
+                        leaderboard_table = gr.components.Dataframe(
+                            value=leaderboard_df,
+                            headers=COLS,
+                            datatype=TYPES,
+                            max_rows=None,
+                            elem_id="leaderboard-table",
+                        )
 
                 # Dummy leaderboard for handling the case when the user uses backspace key
                 hidden_leaderboard_table_for_search = gr.components.Dataframe(
                     value=original_df,
                     headers=COLS,
                     datatype=TYPES,
-                    max_rows=…
+                    max_rows=None,
                     visible=False,
                 )
-
                 search_bar.submit(
                     search_table,
                     [hidden_leaderboard_table_for_search, search_bar],
                     leaderboard_table,
                 )
+
+                # Dummy leaderboard for handling the case when the user uses backspace key
+                hidden_leaderboard_table_for_search_lite = gr.components.Dataframe(
+                    value=original_df[COLS_LITE],
+                    headers=COLS_LITE,
+                    datatype=TYPES_LITE,
+                    max_rows=None,
+                    visible=False,
+                )
+                search_bar.submit(
+                    search_table,
+                    [hidden_leaderboard_table_for_search_lite, search_bar],
+                    leaderboard_table_lite,
+                )
+
             with gr.Row():
                 gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
 
@@ -625,7 +457,7 @@ with demo:
                 gr.Markdown(HUMAN_GPT_EVAL_TEXT, elem_classes="markdown-text")
             with gr.Column(scale=1):
                 gr.Image(
-                    "scale-hf-logo.png", elem_id="scale-logo", show_label=False
+                    "src/assets/scale-hf-logo.png", elem_id="scale-logo", show_label=False
                 )
         gr.Markdown("## No tie allowed")
         elo_leaderboard_table = gr.components.Dataframe(
@@ -660,22 +492,23 @@ with demo:
         tabs,
         _js=get_window_url_params,
     )
-    …
+    if ADD_PLOTS:
+        with gr.Box():
+            visualization_title = gr.HTML(VISUALIZATION_TITLE)
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown(f"#### Figure 1: {PLOT_1_TITLE}")
+                    plot_1 = gr.Plot(plot_1, show_label=False)
+                with gr.Column():
+                    gr.Markdown(f"#### Figure 2: {PLOT_2_TITLE}")
+                    plot_2 = gr.Plot(plot_2, show_label=False)
+            with gr.Row():
+                with gr.Column():
+                    gr.Markdown(f"#### Figure 3: {PLOT_3_TITLE}")
+                    plot_3 = gr.Plot(plot_3, show_label=False)
+                with gr.Column():
+                    gr.Markdown(f"#### Figure 4: {PLOT_4_TITLE}")
+                    plot_4 = gr.Plot(plot_4, show_label=False)
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600)
    	
src/assets/css_html_js.py ADDED

@@ -0,0 +1,87 @@
+custom_css = """
+#changelog-text {
+    font-size: 16px !important;
+}
+
+#changelog-text h2 {
+    font-size: 18px !important;
+}
+
+.markdown-text {
+    font-size: 16px !important;
+}
+
+#models-to-add-text {
+    font-size: 18px !important;
+}
+
+#citation-button span {
+    font-size: 16px !important;
+}
+
+#citation-button textarea {
+    font-size: 16px !important;
+}
+
+#citation-button > label > button {
+    margin: 6px;
+    transform: scale(1.3);
+}
+
+#leaderboard-table {
+    margin-top: 15px
+}
+
+#leaderboard-table-lite {
+    margin-top: 15px
+}
+
+#search-bar-table-box > div:first-child {
+    background: none;
+    border: none;
+}
+ 
+#search-bar {
+    padding: 0px;
+    width: 30%;
+}
+
+/* Hides the final AutoEvalColumn */
+#llm-benchmark-tab-table table td:last-child,
+#llm-benchmark-tab-table table th:last-child {
+    display: none;
+}
+
+/* Limit the width of the first AutoEvalColumn so that names don't expand too much */
+table td:first-child,
+table th:first-child {
+    max-width: 400px;
+    overflow: auto;
+    white-space: nowrap;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+#scale-logo {
+    border-style: none !important;
+    box-shadow: none;
+    display: block;
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 600px;
+}
+
+#scale-logo .download {
+    display: none;
+}
+"""
+
+get_window_url_params = """
+    function(url_params) {
+        const params = new URLSearchParams(window.location.search);
+        url_params = Object.fromEntries(params);
+        return url_params;
+    }
+    """
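get_window_url_params is the JavaScript handed to Gradio's _js hook, as the app.py hunks above show (demo.load(..., tabs, _js=get_window_url_params)); only the tail of app.py's change_tab callback is visible there. A small, hypothetical reproduction of that wiring, with a stand-in change_tab body:

# Hypothetical wiring sketch (gradio 3.x era API, matching the `_js=` usage
# in the app.py hunks above).
import json
import gradio as gr
from src.assets.css_html_js import custom_css, get_window_url_params

def change_tab(query_param):
    # Stand-in callback: gradio delivers the JS return value through the
    # hidden Textbox, so it may arrive as a string; parse defensively.
    if isinstance(query_param, str):
        try:
            query_param = json.loads(query_param.replace("'", '"'))
        except json.JSONDecodeError:
            query_param = {}
    if query_param.get("tab") == "evaluation":
        return gr.Tabs.update(selected=1)
    return gr.Tabs.update(selected=0)

demo = gr.Blocks(css=custom_css)
with demo:
    dummy = gr.Textbox(visible=False)
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("Leaderboard", id=0):
            gr.Markdown("…")
        with gr.TabItem("Evaluation queue", id=1):
            gr.Markdown("…")
    # The JS runs in the browser on page load, returns the URL params, and
    # that value is fed to change_tab, whose output updates the Tabs component.
    demo.load(change_tab, dummy, tabs, _js=get_window_url_params)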
    	
        src/assets/hardcoded_evals.py
    ADDED
    
@@ -0,0 +1,38 @@
+from src.utils_display import AutoEvalColumn, model_hyperlink
+
+gpt4_values = {
+    AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt4"),
+    AutoEvalColumn.revision.name: "tech report",
+    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.average.name: 84.3,
+    AutoEvalColumn.arc.name: 96.3,
+    AutoEvalColumn.hellaswag.name: 95.3,
+    AutoEvalColumn.mmlu.name: 86.4,
+    AutoEvalColumn.truthfulqa.name: 59.0,
+    AutoEvalColumn.dummy.name: "GPT-4",
+}
+
+gpt35_values = {
+    AutoEvalColumn.model.name: model_hyperlink("https://arxiv.org/abs/2303.08774", "gpt3.5"),
+    AutoEvalColumn.revision.name: "tech report",
+    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.average.name: 71.9,
+    AutoEvalColumn.arc.name: 85.2,
+    AutoEvalColumn.hellaswag.name: 85.5,
+    AutoEvalColumn.mmlu.name: 70.0,
+    AutoEvalColumn.truthfulqa.name: 47.0,
+    AutoEvalColumn.dummy.name: "GPT-3.5",
+}
+
+baseline = {
+    AutoEvalColumn.model.name: "<p>Baseline</p>",
+    AutoEvalColumn.revision.name: "N/A",
+    AutoEvalColumn.is_8bit.name: None,
+    AutoEvalColumn.average.name: 25.0,
+    AutoEvalColumn.arc.name: 25.0,
+    AutoEvalColumn.hellaswag.name: 25.0,
+    AutoEvalColumn.mmlu.name: 25.0,
+    AutoEvalColumn.truthfulqa.name: 25.0,
+    AutoEvalColumn.dummy.name: "baseline",
+}
+
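
These dicts are shaped exactly like leaderboard rows, so they can be appended to the scraped results before the table is built. A small usage sketch (the surrounding assembly code is assumed, not shown in this commit):

    # Sketch: appending the hardcoded reference rows to the auto-eval results.
    import pandas as pd
    from src.assets.hardcoded_evals import gpt4_values, gpt35_values, baseline

    leaderboard_data = []  # normally populated from get_eval_results_dicts()
    leaderboard_data.extend([gpt4_values, gpt35_values, baseline])

    df = pd.DataFrame.from_records(leaderboard_data)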
    	
        scale-hf-logo.png → src/assets/scale-hf-logo.png
    RENAMED
    
File without changes
    	
        content.py → src/assets/text_content.py
    RENAMED
    
@@ -1,4 +1,8 @@
 CHANGELOG_TEXT = f"""
+## [2023-06-16]
+- Refactored code base
+- Added new columns: number of parameters, hub likes, license
+
 ## [2023-06-13]
 - Adjust description for TruthfulQA
 
@@ -13,7 +17,7 @@ CHANGELOG_TEXT = f"""
 - Add a typeahead search bar
 - Use webhooks to automatically spawn a new Space when someone opens a PR
 - Start recording `submitted_time` for eval requests
-- Limit 
+- Limit AutoEvalColumn max-width
 
 ## [2023-05-30]
 - Add a citation button
    	
        src/auto_leaderboard/get_model_metadata.py
    ADDED
    
@@ -0,0 +1,54 @@
+import re
+from typing import List
+
+from src.utils_display import AutoEvalColumn
+
+from huggingface_hub import HfApi
+import huggingface_hub
+api = HfApi()
+
+
+def get_model_infos_from_hub(leaderboard_data: List[dict]):
+    for model_data in leaderboard_data:
+        model_name = model_data["model_name_for_query"]
+        try:
+            model_info = api.model_info(model_name)
+        except huggingface_hub.utils._errors.RepositoryNotFoundError:
+            model_data[AutoEvalColumn.license.name] = None
+            model_data[AutoEvalColumn.likes.name] = None
+            model_data[AutoEvalColumn.params.name] = None
+            continue
+
+        model_data[AutoEvalColumn.license.name] = get_model_license(model_info)
+        model_data[AutoEvalColumn.likes.name] = get_model_likes(model_info)
+        model_data[AutoEvalColumn.params.name] = get_model_size(model_name, model_info)
+
+
+def get_model_license(model_info):
+    try:
+        return model_info.cardData["license"]
+    except Exception:
+        return None
+
+def get_model_likes(model_info):
+    return model_info.likes
+
+size_pattern = re.compile(r"\d+(b|m)")
+
+def get_model_size(model_name, model_info):
+    # In billions
+    try:
+        return model_info.safetensors["total"] / 1e9
+    except AttributeError:
+        #print(f"Repository {model_id} does not have safetensors weights")
+        pass
+    try:
+        size_match = re.search(size_pattern, model_name.lower())
+        size = size_match.group(0)
+        return int(size[:-1]) if size[-1] == "b" else int(size[:-1]) / 1e3
+    except AttributeError:
+        return None
+
+
+def apply_metadata(leaderboard_data: List[dict]):
+    get_model_infos_from_hub(leaderboard_data)
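
`apply_metadata` mutates the row dicts in place, keyed on the hidden `model_name_for_query` column; `get_model_size` first trusts the Hub's safetensors parameter count and only then falls back to a `\d+(b|m)` match in the model name. A usage sketch (model names are placeholders, not from this commit):

    # Sketch: enriching leaderboard rows with Hub metadata in place.
    from src.auto_leaderboard.get_model_metadata import apply_metadata

    leaderboard_data = [
        {"model_name_for_query": "EleutherAI/gpt-neox-20b"},
        {"model_name_for_query": "some-org/deleted-model"},  # missing repo -> None fields
    ]
    apply_metadata(leaderboard_data)
    # Each dict now also carries "Hub License", "Hub ❤️", and "#Params (B)" keys.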
    	
        utils.py → src/auto_leaderboard/load_results.py
    RENAMED
    
@@ -1,47 +1,23 @@
+from dataclasses import dataclass
+
 import glob
 import json
-from dataclasses import dataclass
 from typing import Dict, List, Tuple
 
+from src.utils_display import AutoEvalColumn, make_clickable_model
 import numpy as np
 
 # clone / pull the lmeh eval data
 METRICS = ["acc_norm", "acc_norm", "acc_norm", "mc2"]
 BENCHMARKS = ["arc_challenge", "hellaswag", "hendrycks", "truthfulqa_mc"]
 BENCH_TO_NAME = {
-    "arc_challenge": 
-    "hellaswag": 
-    "hendrycks": 
-    "truthfulqa_mc": 
+    "arc_challenge": AutoEvalColumn.arc.name,
+    "hellaswag": AutoEvalColumn.hellaswag.name,
+    "hendrycks": AutoEvalColumn.mmlu.name,
+    "truthfulqa_mc": AutoEvalColumn.truthfulqa.name,
 }
 
 
-def make_clickable_model(model_name):
-    LLAMAS = [
-        "huggingface/llama-7b",
-        "huggingface/llama-13b",
-        "huggingface/llama-30b",
-        "huggingface/llama-65b",
-    ]
-    if model_name in LLAMAS:
-        model = model_name.split("/")[1]
-        return f'<a target="_blank" href="https://ai.facebook.com/blog/large-language-model-llama-meta-ai/" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model}</a>'
-
-    if model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
-        link = "https://huggingface.co/" + "CarperAI/stable-vicuna-13b-delta"
-        return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">stable-vicuna-13b</a>'
-
-    if model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
-        link = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
-        return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">alpaca-13b</a>'
-
-    # remove user from model name
-    # model_name_show = ' '.join(model_name.split('/')[1:])
-
-    link = "https://huggingface.co/" + model_name
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-
-
 @dataclass
 class EvalResult:
     eval_name: str
@@ -58,12 +34,12 @@ class EvalResult:
             base_model = f"{self.model}"
         data_dict = {}
 
-        data_dict["eval_name"] = self.eval_name
-        data_dict[
-        data_dict[
-        data_dict[
-        data_dict[
-        data_dict[
+        data_dict["eval_name"] = self.eval_name # not a column, just a save name
+        data_dict[AutoEvalColumn.is_8bit.name] = self.is_8bit
+        data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
+        data_dict[AutoEvalColumn.dummy.name] = base_model
+        data_dict[AutoEvalColumn.revision.name] = self.revision
+        data_dict[AutoEvalColumn.average.name] = round(
             sum([v for k, v in self.results.items()]) / 4.0, 1
         )
 
@@ -88,17 +64,15 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
     revision = path_split[-3]
     if len(path_split) == 7:
         # handles gpt2 type models that don't have an org
-        result_key = f"{
+        result_key = f"{model}_{revision}_{is_8bit}"
     else:
-        result_key = (
-            f"{path_split[-5]}_{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
-        )
         org = path_split[-5]
+        result_key =  f"{org}_{model}_{revision}_{is_8bit}"
 
     eval_result = None
     for benchmark, metric in zip(BENCHMARKS, METRICS):
         if benchmark in json_filepath:
-            accs = np.array([v[metric] for k, v in data["results"].items()])
+            accs = np.array([v[metric] for v in data["results"].values()])
             mean_acc = round(np.mean(accs) * 100.0, 1)
             eval_result = EvalResult(
                 result_key, org, model, revision, is_8bit, {benchmark: mean_acc}
@@ -109,18 +83,19 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
 
 def get_eval_results(is_public) -> List[EvalResult]:
     json_filepaths = glob.glob(
-        "
+        "auto_evals/eval_results/public/**/16bit/*.json", recursive=True
     )
     if not is_public:
         json_filepaths += glob.glob(
-            "
+            "auto_evals/eval_results/private/**/*.json", recursive=True
        )
         json_filepaths += glob.glob(
-            "
+            "auto_evals/eval_results/private/**/*.json", recursive=True
        )
+        # include the 8bit evals of public models
         json_filepaths += glob.glob(
-            "
-        )  
+            "auto_evals/eval_results/public/**/8bit/*.json", recursive=True
+        )  
     eval_results = {}
 
     for json_filepath in json_filepaths:
@@ -130,7 +105,7 @@ def get_eval_results(is_public) -> List[EvalResult]:
     else:
         eval_results[result_key] = eval_result
 
-    eval_results = [v for k, v in eval_results.items()]
+    eval_results = [v for v in eval_results.values()]
 
     return eval_results
 
@@ -139,12 +114,3 @@ def get_eval_results_dicts(is_public=True) -> List[Dict]:
     eval_results = get_eval_results(is_public)
 
     return [e.to_dict() for e in eval_results]
-
-
-get_window_url_params = """
-    function(url_params) {
-        const params = new URLSearchParams(window.location.search);
-        url_params = Object.fromEntries(params);
-        return url_params;
-    }
-    """
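
Each results JSON covers a single benchmark, so the four files of one model share a `result_key` and are merged into a single `EvalResult`; `to_dict` then averages the four benchmark scores. A toy illustration of that arithmetic, with invented scores:

    # Invented numbers; mirrors `sum([v for k, v in self.results.items()]) / 4.0`.
    results = {
        "arc_challenge": 42.0,
        "hellaswag": 68.3,
        "hendrycks": 31.7,
        "truthfulqa_mc": 39.4,
    }
    average = round(sum([v for k, v in results.items()]) / 4.0, 1)  # -> 45.4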
    	
        elo_utils.py → src/elo_leaderboard/load_results.py
    RENAMED
    
@@ -6,9 +6,9 @@ import numpy as np
 import pandas as pd
 from datasets import load_dataset
 
-from 
-from 
-from visualizations import (
+from src.assets.text_content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
+from src.utils_display import make_clickable_model, EloEvalColumn
+from .visualizations import (
     get_bootstrap_result,
     switch_model_a_b,
     visualize_battle_count,
@@ -18,29 +18,6 @@ from visualizations import (
 )
 
 
-KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
-VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
-OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
-DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
-MODEL_PAGE = "https://huggingface.co/models"
-
-
-def make_clickable_model_elo(model_name):
-    link = ""
-    if model_name == "dolly-12b":
-        link = DOLLY_LINK
-    elif model_name == "vicuna-13b":
-        link = VICUNA_LINK
-    elif model_name == "koala-13b":
-        link = KOALA_LINK
-    elif model_name == "oasst-12b":
-        link = OASST_LINK
-    else:
-        link = MODEL_PAGE
-
-    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
-
-
 @dataclass
 class EloEvalResult:
     model: str
@@ -53,11 +30,11 @@ class EloEvalResult:
     def to_dict(self):
         base_model = f"{self.model}"
         data_dict = {}
-        data_dict[
-        data_dict[
-        data_dict[
-        data_dict[
-        data_dict[
+        data_dict[EloEvalColumn.model.name] = make_clickable_model(base_model)
+        data_dict[EloEvalColumn.gpt4.name] = self.gpt_4_all
+        data_dict[EloEvalColumn.human_all.name] = self.human_all
+        data_dict[EloEvalColumn.human_instruct.name] = self.human_instruct
+        data_dict[EloEvalColumn.human_code_instruct.name] = self.human_code_instruct
 
         return data_dict
 
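
For reference, a toy sketch of the row such a result produces, assuming the dataclass declares exactly the fields that `to_dict` reads (the Elo scores below are invented):

    # Sketch (assumed constructor fields, invented scores): one Elo leaderboard row.
    from src.elo_leaderboard.load_results import EloEvalResult

    row = EloEvalResult(
        model="oasst-12b",
        gpt_4_all=1105,
        human_all=1050,
        human_instruct=1020,
        human_code_instruct=980,
    ).to_dict()
    # row["Model"] is a clickable Hub link; the four scores keep their column names.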
    	
        visualizations.py → src/elo_leaderboard/visualizations.py
    RENAMED
    
@@ -133,5 +133,5 @@ def visualize_rating_count(df, title):
     fig.update_layout(xaxis_title="model", yaxis_title="Rating Count", showlegend=False)
     fig.update_yaxes(range=[y_begin, y_end])
     # save the plot for the blog:
-    fig.write_html("model_counts.html", full_html=False, include_plotlyjs="cdn")
+    fig.write_html("src/assets/model_counts.html", full_html=False, include_plotlyjs="cdn")
     return fig
    	
        src/init.py
    ADDED
    
@@ -0,0 +1,73 @@
+import os
+from huggingface_hub import Repository
+
+H4_TOKEN = os.environ.get("H4_TOKEN", None)
+
+
+def get_all_requested_models(requested_models_dir):
+    depth = 1
+    file_names = []
+
+    for root, dirs, files in os.walk(requested_models_dir):
+        current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
+        if current_depth == depth:
+            file_names.extend([os.path.join(root, file) for file in files])
+
+    return set([file_name.lower().split("eval_requests/")[1] for file_name in file_names])
+
+def load_all_info_from_hub(LMEH_REPO, HUMAN_EVAL_REPO, GPT_4_EVAL_REPO):
+    auto_eval_repo = None
+    requested_models = None
+    if H4_TOKEN:
+        print("Pulling evaluation requests and results.")
+        # try:
+        #     shutil.rmtree("./auto_evals/")
+        # except:
+        #     pass
+
+        auto_eval_repo = Repository(
+            local_dir="./auto_evals/",
+            clone_from=LMEH_REPO,
+            use_auth_token=H4_TOKEN,
+            repo_type="dataset",
+        )
+        auto_eval_repo.git_pull()
+
+        requested_models_dir = "./auto_evals/eval_requests"
+        requested_models = get_all_requested_models(requested_models_dir)
+
+    human_eval_repo = None
+    if H4_TOKEN and not os.path.isdir("./human_evals"):
+        print("Pulling human evaluation repo")
+        human_eval_repo = Repository(
+            local_dir="./human_evals/",
+            clone_from=HUMAN_EVAL_REPO,
+            use_auth_token=H4_TOKEN,
+            repo_type="dataset",
+        )
+        human_eval_repo.git_pull()
+
+    gpt_4_eval_repo = None
+    if H4_TOKEN and not os.path.isdir("./gpt_4_evals"):
+        print("Pulling GPT-4 evaluation repo")
+        gpt_4_eval_repo = Repository(
+            local_dir="./gpt_4_evals/",
+            clone_from=GPT_4_EVAL_REPO,
+            use_auth_token=H4_TOKEN,
+            repo_type="dataset",
+        )
+        gpt_4_eval_repo.git_pull()
+
+    return auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models
+
+
+#def load_results(model, benchmark, metric):
+#    file_path = os.path.join("autoevals", model, f"{model}-eval_{benchmark}.json")
+#    if not os.path.exists(file_path):
+#        return 0.0, None
+
+#    with open(file_path) as fp:
+#        data = json.load(fp)
+#    accs = np.array([v[metric] for k, v in data["results"].items()])
+#    mean_acc = np.mean(accs)
+#    return mean_acc, data["config"]["model_args"]
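
A usage sketch for `load_all_info_from_hub`, passing the three dataset repo ids the way app.py presumably wires it (a valid `H4_TOKEN` must be set in the environment, otherwise everything comes back as `None`):

    # Sketch: pull the three evaluation repos at startup (needs H4_TOKEN set).
    from src.init import load_all_info_from_hub

    auto_eval_repo, human_eval_repo, gpt_4_eval_repo, requested_models = load_all_info_from_hub(
        "HuggingFaceH4/lmeh_evaluations",
        "HuggingFaceH4/scale-human-eval",
        "HuggingFaceH4/open_llm_leaderboard_oai_evals",
    )
    if requested_models is not None:
        print(f"{len(requested_models)} eval requests on disk")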
    	
        src/utils_display.py
    ADDED
    
@@ -0,0 +1,96 @@
+from dataclasses import dataclass
+
+# These classes are for user facing column names, to avoid having to change them
+# all around the code when a modif is needed
+@dataclass
+class ColumnContent:
+    name: str
+    type: str
+    displayed_by_default: bool
+
+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+@dataclass(frozen=True)
+class AutoEvalColumn: # Auto evals column
+    model = ColumnContent("Model", "markdown", True)
+    revision = ColumnContent("Revision", "str", True)
+    is_8bit = ColumnContent("8bit", "bool", False)
+    license = ColumnContent("Hub License", "str", False)
+    params = ColumnContent("#Params (B)", "number", False)
+    likes = ColumnContent("Hub ❤️", "number", False)
+    average = ColumnContent("Average ⬆️", "number", True)
+    arc = ColumnContent("ARC (25-s) ⬆️", "number", True)
+    hellaswag = ColumnContent("HellaSwag (10-s) ⬆️", "number", True)
+    mmlu = ColumnContent("MMLU (5-s) ⬆️", "number", True)
+    truthfulqa = ColumnContent("TruthfulQA (MC) (0-s) ⬆️", "number", True)
+    dummy = ColumnContent("model_name_for_query", "str", True) # dummy col to implement search bar (hidden by custom CSS)
+
+@dataclass(frozen=True)
+class EloEvalColumn: # Elo evals column
+    model = ColumnContent("Model", "markdown", True)
+    gpt4 = ColumnContent("GPT-4 (all)", "number", True)
+    human_all = ColumnContent("Human (all)", "number", True)
+    human_instruct = ColumnContent("Human (instruct)", "number", True)
+    human_code_instruct = ColumnContent("Human (code-instruct)", "number", True)
+
+
+@dataclass(frozen=True)
+class EvalQueueColumn: # Queue column
+    model = ColumnContent("model", "markdown", True)
+    revision = ColumnContent("revision", "str", True)
+    private = ColumnContent("private", "bool", True)
+    is_8bit = ColumnContent("8bit_eval", "bool", True)
+    has_delta_weight = ColumnContent("is_delta_weight", "bool", True)
+    status = ColumnContent("status", "str", True)
+
+LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]
+
+
+KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
+VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
+OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
+DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
+MODEL_PAGE = "https://huggingface.co/models"
+LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
+VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
+ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
+
+
+def model_hyperlink(link, model_name):
+    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
+
+
+def make_clickable_model(model_name):
+    link = f"https://huggingface.co/{model_name}"
+
+    if model_name in LLAMAS:
+        link = LLAMA_LINK
+        model_name = model_name.split("/")[1]
+    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
+        link = VICUNA_LINK
+        model_name = "stable-vicuna-13b"
+    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
+        link = ALPACA_LINK
+        model_name = "alpaca-13b"
+    if model_name == "dolly-12b":
+        link = DOLLY_LINK
+    elif model_name == "vicuna-13b":
+        link = VICUNA_LINK
+    elif model_name == "koala-13b":
+        link = KOALA_LINK
+    elif model_name == "oasst-12b":
+        link = OASST_LINK
+    #else:
+    #    link = MODEL_PAGE
+
+    return model_hyperlink(link, model_name)
+
+def styled_error(error):
+    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
+
+def styled_warning(warn):
+    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
+
+def styled_message(message):
+    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
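
`fields()` enumerates the `ColumnContent` attributes of a column class, which is what lets the rest of the app derive table headers, datatypes, and default visibility from a single definition. A short sketch of that pattern:

    # Sketch: deriving display metadata from AutoEvalColumn via fields().
    from src.utils_display import AutoEvalColumn, fields

    headers = [c.name for c in fields(AutoEvalColumn)]
    types = [c.type for c in fields(AutoEvalColumn)]
    visible = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default]

    print(headers[:3])  # ['Model', 'Revision', '8bit']
    print(visible[0])   # 'Model' (8bit, Hub License, #Params (B), Hub ❤️ are hidden by default)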
