Merge branch 'main' of https://huggingface.co/spaces/unlearningltd/leaderboard
- app.py +9 -116
- src/about.py +0 -39
- src/display/utils.py +4 -11
- src/envs.py +2 -6
- src/leaderboard/read_evals.py +7 -0
- src/populate.py +2 -7
app.py
CHANGED
@@ -1,45 +1,31 @@
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download

 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    BENCHMARK_COLS,
     COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
     AutoEvalColumn,
     ModelType,
     WeightType,
     Precision,
     fields,
 )
-from src.envs import API,
-from src.populate import
-from src.submission.submit import add_new_eval
+from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_leaderboard_df


 def restart_space():
     API.restart_space(repo_id=REPO_ID)

 ### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
@@ -48,14 +34,11 @@ try:
 except Exception:
     restart_space()

+""" adapted from original template, deleted everything related to queue and request, and unrelated 'titles'

+our leaderboard does not have a submission queue system, does not use request, reads directly from the result repository, and displays the leaderboard
+"""
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, None, COLS, []) # empty arguments to meet the function requirement
-
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -81,99 +64,9 @@ with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-
-
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-                    submit_button = gr.Button("Submit Eval")
-                    submission_result = gr.Markdown()
-                    submit_button.click(
-                        add_new_eval,
-                        [
-                            model_name_textbox,
-                            base_model_name_textbox,
-                            revision_name_textbox,
-                            precision,
-                            weight_type,
-                            model_type,
-                        ],
-                        submission_result,
-                    )
+    with gr.Tabs(elem_classes="tab-buttons") as tabs: # only one tabitem left
+        with gr.TabItem("Leaderboard"):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
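Net effect on app.py: the queue and submission UI are gone, the Space downloads only the results dataset, builds `LEADERBOARD_DF` once at startup, and renders it in a single tab. A minimal runnable sketch of that flow follows; the toy DataFrame and its column names are hypothetical stand-ins for the real `get_leaderboard_df(EVAL_RESULTS_PATH, None, COLS, [])` output.

```python
# Sketch of the simplified single-tab app; the toy DataFrame is a hypothetical
# stand-in for LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, None, COLS, []).
import gradio as gr
import pandas as pd
from gradio_leaderboard import Leaderboard

# Illustrative rows only; the real frame is read from the results repository.
LEADERBOARD_DF = pd.DataFrame(
    [
        {"technique": "gradient_ascent", "forget_score": 0.42, "utility_score": 0.61},
        {"technique": "rmu", "forget_score": 0.55, "utility_score": 0.58},
    ]
)


def init_leaderboard(dataframe: pd.DataFrame) -> Leaderboard:
    if dataframe is None or dataframe.empty:
        raise ValueError("Leaderboard DataFrame is empty or None.")
    return Leaderboard(value=dataframe, interactive=False)


with gr.Blocks() as demo:
    with gr.Tabs(elem_classes="tab-buttons") as tabs:  # only one tab item left
        with gr.TabItem("Leaderboard"):
            leaderboard = init_leaderboard(LEADERBOARD_DF)

if __name__ == "__main__":
    demo.launch()
```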
src/about.py
CHANGED
@@ -32,45 +32,6 @@ INTRODUCTION_TEXT = """
 Intro text
 """

-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-## How it works
-
-## Reproducibility
-To reproduce our results, here is the commands you can run:
-
-"""
-
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
-
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 """
src/display/utils.py
CHANGED
@@ -1,17 +1,13 @@
 from dataclasses import dataclass, field, make_dataclass
 from enum import Enum

-
-
-
+""" adapted from original template, where unnecessary code was removed
+util.py is used for defining our fixed columns, which will be referenced to from app.py
+ColumnContent dataclass used to define column properties"""

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

-
-# These classes are for user facing column names,
-# to avoid having to change them all around the code
-# when a modif is needed
 @dataclass
 class ColumnContent:
     name: str
@@ -92,8 +88,5 @@ class Precision(Enum):
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

-
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS = []

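For context on the docstring kept above: in the template, columns are declared as `ColumnContent` entries, assembled into `AutoEvalColumn` with `make_dataclass`, and `COLS` is derived from that. A self-contained sketch of the pattern, with made-up column names rather than the Space's real schema:

```python
# Standalone sketch of the ColumnContent / make_dataclass pattern that utils.py keeps;
# the two columns here are hypothetical examples, not the leaderboard's actual schema.
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    name: str                    # user-facing column name
    type: str                    # gradio datatype, e.g. "str" or "number"
    displayed_by_default: bool
    hidden: bool = False


def fields(raw_class):
    # Same helper as in utils.py: pick up the class-level ColumnContent values.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


AutoEvalColumn = make_dataclass(
    "AutoEvalColumn",
    [
        ["technique", ColumnContent, ColumnContent("Technique", "str", True)],
        ["forget_score", ColumnContent, ColumnContent("Forget score", "number", True)],
    ],
    frozen=True,
)

COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
print(COLS)  # ['Technique', 'Forget score']
```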
src/envs.py
CHANGED
@@ -1,25 +1,21 @@
 import os

 from huggingface_hub import HfApi
-
+""" adapted from original template, removed unnecessary code """
 # Info to change for your repository
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

-OWNER = "
+OWNER = "Unlearningltd" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------

 REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"

 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")

 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

 API = HfApi(token=TOKEN)
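The surviving constants feed the results download in app.py. A hedged sketch of that usage: the exact keyword arguments of the results `snapshot_download` call are cut off in this diff, so the ones below follow the removed queue call and are assumptions.

```python
# Assumed usage of the remaining envs.py constants; the kwargs mirror the removed
# queue download shown above, since the results call itself is truncated in this diff.
from huggingface_hub import snapshot_download

from src.envs import EVAL_RESULTS_PATH, RESULTS_REPO, TOKEN

snapshot_download(
    repo_id=RESULTS_REPO,         # "Unlearningltd/results"
    local_dir=EVAL_RESULTS_PATH,  # "<HF_HOME>/eval-results"
    repo_type="dataset",
    tqdm_class=None,
    etag_timeout=30,
    token=TOKEN,
)
```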
src/leaderboard/read_evals.py
CHANGED
@@ -5,6 +5,13 @@ from dataclasses import dataclass, field
 from src.display.utils import AutoEvalColumn
 from src.about import Tasks

+for file in files: # each json file has its own row in the data frame
+    with open(file, 'r') as file_json:
+        data = json.load(file_json)
+        row = {"technique": data.get("technique_name", None)} # metric result is a nested dict
+        for eval_method, result in data.get("metric_results", {}).items(): # used .get() to prevent KeyError
+            row[eval_method] = result.get('value') # multiple eval results under metric results
+        data_rows.append(row)

 @dataclass
 class EvalResult:
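The seven added lines assume `json`, `files`, and `data_rows` exist in the surrounding (unchanged) code, which this diff does not show. A self-contained sketch of the same row-building logic, with a hypothetical glob-based file discovery standing in for whatever the module actually uses:

```python
# Standalone version of the row-building loop added above; the glob-based discovery
# of result files is an assumption, only the per-file parsing mirrors the diff.
import glob
import json
import os

import pandas as pd


def read_result_rows(results_path: str) -> pd.DataFrame:
    """One row per result JSON: the technique name plus one column per metric."""
    files = glob.glob(os.path.join(results_path, "**", "*.json"), recursive=True)
    data_rows = []
    for file in files:  # each json file has its own row in the data frame
        with open(file, "r") as file_json:
            data = json.load(file_json)
        row = {"technique": data.get("technique_name", None)}  # metric results are nested
        for eval_method, result in data.get("metric_results", {}).items():  # .get() avoids KeyError
            row[eval_method] = result.get("value")  # multiple eval results under metric_results
        data_rows.append(row)
    return pd.DataFrame(data_rows)
```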
src/populate.py
CHANGED
@@ -1,12 +1,7 @@
-import json
-import os
-
-import pandas as pd
-
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
+import pandas as pd

+""" calls get_raw_eval_results function from our read_evals.py file to get the DataFrame"""

 def get_leaderboard_df(results_path: str, requests_path: str = None, cols: list = None, benchmark_cols: list = None) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""
|