Commit 439afd4 · eduagarcia committed
Parent(s): 7a2430c

Fix model eval links and remove huggingface icon from Leaderboard name
Files changed:
- src/display/about.py (+8 -7)
- src/display/formatting.py (+18 -3)
- src/display/utils.py (+1 -1)
- src/leaderboard/read_evals.py (+6 -1)
- src/populate.py (+3 -3)
src/display/about.py CHANGED

@@ -4,21 +4,22 @@ from src.envs import REPO_ID, QUEUE_REPO, RESULTS_REPO, PATH_TO_COLLECTION, LEAD
 
 LM_EVAL_URL = "https://github.com/eduagarcia/lm-evaluation-harness-pt"
 
-TITLE = F"""<h1 align="center" id="space-title"
+TITLE = F"""<h1 align="center" id="space-title">π {LEADERBOARD_NAME}</h1>"""
 
 INTRODUCTION_TEXT = f"""
-π The
+π The π {LEADERBOARD_NAME} aims to track, rank and evaluate open LLMs and chatbots.
 
 This is a fork of the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" target="_blank">π€ Open LLM Leaderboard</a> with different benchmarks.
 
-
+Submit a model for automated evaluation on our GPU cluster on the "Submit" page!
 The leaderboard's backend runs the great [Eleuther AI Language Model Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) - read more details in the "About" page!
 """
-
+task_count = 0
 TASKS_LIST= ""
 for task in Tasks:
     task = task.value
     TASKS_LIST += f'- <a href="{task.link}" target="_blank"> {task.col_name} </a> ({task.few_shot}-shot) - {task.description}\n'
+    task_count += 1
 
 TASKS_PARAMETERS = ""
 for task in Tasks:

@@ -33,7 +34,7 @@ With the plethora of large language models (LLMs) and chatbots being released we
 
 ## How it works
 
-π We evaluate models on
+π We evaluate models on {task_count} key benchmarks using the <a href="https://github.com/EleutherAI/lm-evaluation-harness" target="_blank"> Eleuther AI Language Model Evaluation Harness </a>, a unified framework to test generative language models on a large number of different evaluation tasks.
 
 {TASKS_LIST}
 

@@ -133,9 +134,9 @@ I have an issue about accessing the leaderboard through the Gradio API
 
 
 EVALUATION_QUEUE_TEXT = f"""
-# Evaluation Queue for the
+# Evaluation Queue for the π {LEADERBOARD_NAME}
 
-Models added here will be automatically evaluated on
+Models added here will be automatically evaluated on our evaluation cluster.
 
 ## First steps before submitting a model
 
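The counting pattern introduced above can be checked in isolation; the sketch below uses a hypothetical two-member enum as a stand-in for the real Tasks from src/display/utils.py:

    from collections import namedtuple
    from enum import Enum

    # Hypothetical stand-in for the real Task/Tasks definitions
    Task = namedtuple("Task", ["col_name", "link", "few_shot", "description"])

    class Tasks(Enum):
        task_a = Task("Task A", "https://example.org/a", 3, "first benchmark")
        task_b = Task("Task B", "https://example.org/b", 5, "second benchmark")

    # Same pattern as the patched about.py: count members while building the list
    task_count = 0
    TASKS_LIST = ""
    for task in Tasks:
        task = task.value
        TASKS_LIST += f'- <a href="{task.link}" target="_blank"> {task.col_name} </a> ({task.few_shot}-shot) - {task.description}\n'
        task_count += 1

    assert task_count == len(Tasks)

Since Tasks is an Enum, len(Tasks) would yield the same number in one line; the manual counter simply keeps the count next to the list it describes.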
src/display/formatting.py CHANGED

@@ -4,6 +4,7 @@ from datetime import datetime, timezone
 from huggingface_hub import HfApi
 from huggingface_hub.hf_api import ModelInfo
 
+from src.envs import RESULTS_REPO, QUEUE_REPO
 
 API = HfApi()
 

@@ -11,11 +12,25 @@ def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
-def
+def make_requests_clickable_model(model_name, json_path=None):
     link = f"https://huggingface.co/{model_name}"
 
-    details_model_name = model_name.replace("/", "__")
-    details_link = f"https://huggingface.co/datasets/
+    #details_model_name = model_name.replace("/", "__")
+    details_link = f"https://huggingface.co/datasets/{QUEUE_REPO}/tree/main"
+    if '/' in model_name:
+        details_link += f"/{model_name.split('/')[0]}"
+    if json_path is not None:
+        details_link = f"https://huggingface.co/datasets/{QUEUE_REPO}/blob/main/{json_path}"
+
+    return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "π")
+
+def make_clickable_model(model_name, json_path=None):
+    link = f"https://huggingface.co/{model_name}"
+
+    #details_model_name = model_name.replace("/", "__")
+    details_link = f"https://huggingface.co/datasets/{RESULTS_REPO}/tree/main/{model_name}"
+    if json_path is not None:
+        details_link = f"https://huggingface.co/datasets/{RESULTS_REPO}/blob/main/{model_name}/{json_path}"
 
     return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "π")
 
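The two helpers differ only in which dataset repo the details icon points at; here is a sketch of the URL logic with illustrative repo names (the real QUEUE_REPO and RESULTS_REPO come from src.envs and may differ):

    QUEUE_REPO = "user/leaderboard-requests"   # illustrative, not the real value
    RESULTS_REPO = "user/leaderboard-results"  # illustrative, not the real value

    def requests_details_link(model_name, json_path=None):
        # Mirrors the URL logic of make_requests_clickable_model above
        link = f"https://huggingface.co/datasets/{QUEUE_REPO}/tree/main"
        if '/' in model_name:
            link += f"/{model_name.split('/')[0]}"  # org folder inside the requests repo
        if json_path is not None:
            link = f"https://huggingface.co/datasets/{QUEUE_REPO}/blob/main/{json_path}"
        return link

    def results_details_link(model_name, json_path=None):
        # Mirrors the URL logic of make_clickable_model above
        if json_path is not None:
            return f"https://huggingface.co/datasets/{RESULTS_REPO}/blob/main/{model_name}/{json_path}"
        return f"https://huggingface.co/datasets/{RESULTS_REPO}/tree/main/{model_name}"

    print(requests_details_link("org/model"))
    # https://huggingface.co/datasets/user/leaderboard-requests/tree/main/org
    print(results_details_link("org/model", "results_2024.json"))
    # https://huggingface.co/datasets/user/leaderboard-results/blob/main/org/model/results_2024.json

When no json_path is available, both helpers fall back to a tree/ listing, so the details icon still resolves to a valid page.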
src/display/utils.py CHANGED

@@ -31,7 +31,7 @@ class Tasks(Enum):
         limit=None,
         task_list=["oab_exams_generate"],
         link="https://huggingface.co/datasets/eduagarcia/oab_exams",
-        description="OAB Exams is a dataset of
+        description="OAB Exams is a dataset of 2,000 questions from the Brazilian Bar Association's exams."
     )
     brazilian_court_decisions_judgment = Task(
         benchmark="brazilian_court_decisions_judgment",
src/leaderboard/read_evals.py CHANGED

@@ -35,6 +35,7 @@ class EvalResult:
     flagged: bool = False
     status: str = "FINISHED"
     tags: list = None
+    json_filename: str = None
 
     @classmethod
     def init_from_json_file(self, json_filepath):

@@ -42,6 +43,8 @@ class EvalResult:
         with open(json_filepath) as fp:
             data = json.load(fp)
 
+        json_filename = os.path.basename(json_filepath)
+
         # We manage the legacy config format
         config = data.get("config_general")
 

@@ -100,6 +103,7 @@ class EvalResult:
             results=results,
             precision=precision,
             revision= config.get("model_sha", ""),
+            json_filename=json_filename
         )
 
     def update_with_request_file(self, requests_path):

@@ -137,7 +141,7 @@ class EvalResult:
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
             AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
-            AutoEvalColumn.model.name: make_clickable_model(self.full_model),
+            AutoEvalColumn.model.name: make_clickable_model(self.full_model, self.json_filename),
             AutoEvalColumn.dummy.name: self.full_model,
             AutoEvalColumn.revision.name: self.revision,
             AutoEvalColumn.average.name: average,

@@ -202,6 +206,7 @@ def get_raw_eval_results(results_path: str, requests_path: str, dynamic_path: st
     eval_results = {}
     for model_result_filepath in model_result_filepaths:
         # Creation of result
+        print(model_result_filepath)
         eval_result = EvalResult.init_from_json_file(model_result_filepath)
         eval_result.update_with_request_file(requests_path)
         if eval_result.full_model in dynamic_data:
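json_filename is just the basename of the result file, captured at load time so that to_dict can pass it to make_clickable_model; for a hypothetical result path:

    import os

    json_filepath = "eval-results/org/model/results_2024-01-01T00-00-00.json"  # hypothetical
    json_filename = os.path.basename(json_filepath)
    print(json_filename)  # results_2024-01-01T00-00-00.json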
src/populate.py CHANGED

@@ -3,7 +3,7 @@ import os
 
 import pandas as pd
 
-from src.display.formatting import has_no_nan_values,
+from src.display.formatting import has_no_nan_values, make_requests_clickable_model
 from src.display.utils import AutoEvalColumn, EvalQueueColumn, baseline_row
 from src.leaderboard.filter_models import filter_models
 from src.leaderboard.read_evals import get_raw_eval_results

@@ -35,7 +35,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
             with open(file_path) as fp:
                 data = json.load(fp)
 
-            data[EvalQueueColumn.model.name] =
+            data[EvalQueueColumn.model.name] = make_requests_clickable_model(data["model"], entry)
             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
 
             all_evals.append(data)

@@ -47,7 +47,7 @@ def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
                 with open(file_path) as fp:
                     data = json.load(fp)
 
-                data[EvalQueueColumn.model.name] =
+                data[EvalQueueColumn.model.name] = make_requests_clickable_model(data["model"], os.path.join(entry, sub_entry))
                data[EvalQueueColumn.revision.name] = data.get("revision", "main")
                 all_evals.append(data)
 
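The json_path argument here is the request file's path relative to the requests repo root: the bare entry for top-level files, entry/sub_entry for files inside an org folder. A sketch of the resulting links, reusing the illustrative QUEUE_REPO from the formatting.py example:

    import os

    QUEUE_REPO = "user/leaderboard-requests"  # illustrative, not the real value

    def queue_blob_url(json_path):
        # URL make_requests_clickable_model builds when json_path is given
        return f"https://huggingface.co/datasets/{QUEUE_REPO}/blob/main/{json_path}"

    # Top-level request file: json_path is the directory entry itself
    print(queue_blob_url("some_model_eval_request.json"))

    # Request file inside an org folder: json_path joins entry and sub_entry
    print(queue_blob_url(os.path.join("org", "model_eval_request_float16.json")))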