Commit 614ee1f
Committed by lewtun
1 Parent(s): 7eee6bd

Fix TruthQA typo

Files changed (2):
  1. app.py +44 -46
  2. utils.py +1 -1
app.py CHANGED
@@ -43,11 +43,11 @@ def load_results(model, benchmark, metric):
     with open(file_path) as fp:
         data = json.load(fp)
     accs = np.array([v[metric] for k, v in data["results"].items()])
-    mean_acc = np.mean(accs)
+    mean_acc = np.mean(accs)
     return mean_acc, data["config"]["model_args"]


-COLS = ["Model", "Revision", "Average ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthQA (0-shot) ⬆️"]
+COLS = ["Model", "Revision", "Average ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthfulQA (0-shot) ⬆️"]
 TYPES = ["markdown","str", "number", "number", "number", "number", "number", ]

 if not IS_PUBLIC:
@@ -57,36 +57,36 @@ if not IS_PUBLIC:
 EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
 EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
 def get_leaderboard():
-    if repo:
+    if repo:
         print("pulling changes")
         repo.git_pull()
-
+
     all_data = get_eval_results_dicts(IS_PUBLIC)
-
+
     if not IS_PUBLIC:
         gpt4_values = {
-            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
-            "Revision":"tech report",
+            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt4</a>',
+            "Revision":"tech report",
             "8bit":None,
             "Average ⬆️":84.3,
             "ARC (25-shot) ⬆️":96.3,
             "HellaSwag (10-shot) ⬆️":95.3,
             "MMLU (5-shot) ⬆️":86.4,
-            "TruthQA (0-shot) ⬆️":59.0,
+            "TruthfulQA (0-shot) ⬆️":59.0,
         }
         all_data.append(gpt4_values)
         gpt35_values = {
-            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
-            "Revision":"tech report",
+            "Model":f'<a target="_blank" href=https://arxiv.org/abs/2303.08774 style="color: blue; text-decoration: underline;text-decoration-style: dotted;">gpt3.5</a>',
+            "Revision":"tech report",
             "8bit":None,
             "Average ⬆️":71.9,
             "ARC (25-shot) ⬆️":85.2,
             "HellaSwag (10-shot) ⬆️":85.5,
             "MMLU (5-shot) ⬆️":70.0,
-            "TruthQA (0-shot) ⬆️":47.0,
+            "TruthfulQA (0-shot) ⬆️":47.0,
         }
         all_data.append(gpt35_values)
-
+
     dataframe = pd.DataFrame.from_records(all_data)
     dataframe = dataframe.sort_values(by=['Average ⬆️'], ascending=False)
     print(dataframe)
@@ -94,38 +94,38 @@ def get_leaderboard():
     return dataframe

 def get_eval_table():
-    if repo:
+    if repo:
         print("pulling changes for eval")
         repo.git_pull()
-    entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
+    entries = [entry for entry in os.listdir("evals/eval_requests") if not entry.startswith('.')]
     all_evals = []
-
+
     for entry in entries:
         print(entry)
         if ".json"in entry:
             file_path = os.path.join("evals/eval_requests", entry)
             with open(file_path) as fp:
                 data = json.load(fp)
-
+
             data["# params"] = "unknown"
             data["model"] = make_clickable_model(data["model"])
             data["revision"] = data.get("revision", "main")
-
+

             all_evals.append(data)
         else:
             # this is a folder
-            sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
+            sub_entries = [e for e in os.listdir(f"evals/eval_requests/{entry}") if not e.startswith('.')]
             for sub_entry in sub_entries:
                 file_path = os.path.join("evals/eval_requests", entry, sub_entry)
                 with open(file_path) as fp:
                     data = json.load(fp)
-
+
                 #data["# params"] = get_n_params(data["model"])
                 data["model"] = make_clickable_model(data["model"])
                 all_evals.append(data)

-
+
     dataframe = pd.DataFrame.from_records(all_evals)
     return dataframe[EVAL_COLS]

@@ -137,12 +137,12 @@ def is_model_on_hub(model_name, revision) -> bool:
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision)
         return True
-
+
     except Exception as e:
         print("Could not get the model config from the hub")
         print(e)
         return False
-
+


 def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool, private:bool, is_delta_weight:bool):
@@ -152,12 +152,12 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool,
     if is_delta_weight and not is_model_on_hub(base_model, revision):
         print(base_model, "base model not found on hub")
         return
-
+
     if not is_model_on_hub(model, revision):
         print(model, "not found on hub")
         return
     print("adding new eval")
-
+
     eval_entry = {
         "model" : model,
         "base_model" : base_model,
@@ -166,22 +166,22 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool,
         "8bit_eval" : is_8_bit_eval,
         "is_delta_weight" : is_delta_weight,
         "status" : "PENDING"
-    }
-
+    }
+
     user_name = ""
     model_path = model
     if "/" in model:
         user_name = model.split("/")[0]
         model_path = model.split("/")[1]
-
+
     OUT_DIR=f"eval_requests/{user_name}"
     os.makedirs(OUT_DIR, exist_ok=True)
     out_path = f"{OUT_DIR}/{model_path}_eval_request_{private}_{is_8_bit_eval}_{is_delta_weight}.json"
-
+
     with open(out_path, "w") as f:
         f.write(json.dumps(eval_entry))
     LMEH_REPO = "HuggingFaceH4/lmeh_evaluations"
-
+
     api = HfApi()
     api.upload_file(
         path_or_fileobj=out_path,
@@ -191,14 +191,14 @@ def add_new_eval(model:str, base_model : str, revision:str, is_8_bit_eval: bool,
         repo_type="dataset",
     )

-
+
 def refresh():
     return get_leaderboard(), get_eval_table()
-
+


 block = gr.Blocks()
-with block:
+with block:
     with gr.Row():
         gr.Markdown(f"""
 # 🤗 Open LLM Leaderboard
@@ -208,49 +208,47 @@ Evaluation is performed against 4 popular benchmarks:
 - <a href="https://arxiv.org/abs/1803.05457" target="_blank"> AI2 Reasoning Challenge </a> (25-shot) - a set of grade-school science questions.
 - <a href="https://arxiv.org/abs/1905.07830" target="_blank"> HellaSwag </a> (10-shot) - a test of commonsense inference, which is easy for humans (~95%) but challenging for SOTA models.
 - <a href="https://arxiv.org/abs/2009.03300" target="_blank"> MMLU </a> (5-shot) - a test to measure a text model's multitask accuracy. The test covers 57 tasks including elementary mathematics, US history, computer science, law, and more.
-- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> Truthful QA MC </a> (0-shot) - a benchmark to measure whether a language model is truthful in generating answers to questions.
+- <a href="https://arxiv.org/abs/2109.07958" target="_blank"> TruthfulQA </a> (0-shot) - a benchmark to measure whether a language model is truthful in generating answers to questions.

 We chose these benchmarks as they test a variety of reasoning and general knowledge across a wide variety of fields in 0-shot and few-shot settings. </font>
 """)
-
+
     with gr.Row():
         leaderboard_table = gr.components.Dataframe(value=leaderboard, headers=COLS,
                                                     datatype=TYPES, max_rows=5)

-
-
+
+
     with gr.Row():
         gr.Markdown(f"""
 # Evaluation Queue for the 🤗 Open LLM Leaderboard, these models will be automatically evaluated on the 🤗 cluster
-
+
 """)
     with gr.Accordion("Evaluation Queue", open=False):
         with gr.Row():
             eval_table = gr.components.Dataframe(value=eval_queue, headers=EVAL_COLS,
-                                                 datatype=EVAL_TYPES, max_rows=5)
-
+                                                 datatype=EVAL_TYPES, max_rows=5)
+
     with gr.Row():
         refresh_button = gr.Button("Refresh")
-        refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
-
+        refresh_button.click(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
+
     with gr.Accordion("Submit a new model for evaluation"):
-        # with gr.Row():
-        #     gr.Markdown(f"""# Submit a new model for evaluation""")
         with gr.Row():
             with gr.Column():
                 model_name_textbox = gr.Textbox(label="Model name")
                 revision_name_textbox = gr.Textbox(label="revision", placeholder="main")
-
+
             with gr.Column():
                 is_8bit_toggle = gr.Checkbox(False, label="8 bit eval", visible=not IS_PUBLIC)
                 private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
                 is_delta_weight = gr.Checkbox(False, label="Delta weights")
                 base_model_name_textbox = gr.Textbox(label="base model (for delta)")
-
+
         with gr.Row():
             submit_button = gr.Button("Submit Eval")
             submit_button.click(add_new_eval, [model_name_textbox, base_model_name_textbox, revision_name_textbox, is_8bit_toggle, private, is_delta_weight])
-
+

 block.load(refresh, inputs=[], outputs=[leaderboard_table, eval_table])
 block.launch()
utils.py CHANGED
@@ -21,7 +21,7 @@ BENCH_TO_NAME = {
     "arc_challenge":"ARC (25-shot) ⬆️",
     "hellaswag":"HellaSwag (10-shot) ⬆️",
     "hendrycks":"MMLU (5-shot) ⬆️",
-    "truthfulqa_mc":"TruthQA (0-shot) ⬆️",
+    "truthfulqa_mc":"TruthfulQA (0-shot) ⬆️",
 }
 def make_clickable_model(model_name):
     LLAMAS = ["huggingface/llama-7b", "huggingface/llama-13b", "huggingface/llama-30b", "huggingface/llama-65b"]