Nikhil Raghavan committed on
Commit 536f8ee · 2 Parent(s): 9775a074b5a2a3

Merge branch 'main' of https://huggingface.co/spaces/unlearningltd/leaderboard

app.py CHANGED
@@ -1,45 +1,31 @@
 import gradio as gr
 from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
-import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download

 from src.about import (
     CITATION_BUTTON_LABEL,
     CITATION_BUTTON_TEXT,
-    EVALUATION_QUEUE_TEXT,
     INTRODUCTION_TEXT,
-    LLM_BENCHMARKS_TEXT,
     TITLE,
 )
 from src.display.css_html_js import custom_css
 from src.display.utils import (
-    BENCHMARK_COLS,
     COLS,
-    EVAL_COLS,
-    EVAL_TYPES,
     AutoEvalColumn,
     ModelType,
     WeightType,
     Precision,
     fields,
 )
-from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
-from src.populate import get_evaluation_queue_df, get_leaderboard_df
-from src.submission.submit import add_new_eval
+from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
+from src.populate import get_leaderboard_df


 def restart_space():
     API.restart_space(repo_id=REPO_ID)

 ### Space initialisation
-try:
-    print(EVAL_REQUESTS_PATH)
-    snapshot_download(
-        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
-    )
-except Exception:
-    restart_space()
 try:
     print(EVAL_RESULTS_PATH)
     snapshot_download(
@@ -48,14 +34,11 @@ try:
 except Exception:
     restart_space()

-
-LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-
-(
-    finished_eval_queue_df,
-    running_eval_queue_df,
-    pending_eval_queue_df,
-) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+""" adapted from original template, deleted everything related to queue and request, and unrelated 'titles'
+
+our leaderboard does not have a submission queue system, does not use request, reads directly from the result repository, and displays the leaderboard
+"""
+LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, None, COLS, [])  # empty arguments to meet the function requirement

 def init_leaderboard(dataframe):
     if dataframe is None or dataframe.empty:
@@ -81,99 +64,9 @@ with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

-    with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
-            leaderboard = init_leaderboard(LEADERBOARD_DF)
-
-        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
-            gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
-
-        with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
-            with gr.Column():
-                with gr.Row():
-                    gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
-
-                with gr.Column():
-                    with gr.Accordion(
-                        f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            finished_eval_table = gr.components.Dataframe(
-                                value=finished_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-                    with gr.Accordion(
-                        f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            running_eval_table = gr.components.Dataframe(
-                                value=running_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-
-                    with gr.Accordion(
-                        f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
-                        open=False,
-                    ):
-                        with gr.Row():
-                            pending_eval_table = gr.components.Dataframe(
-                                value=pending_eval_queue_df,
-                                headers=EVAL_COLS,
-                                datatype=EVAL_TYPES,
-                                row_count=5,
-                            )
-            with gr.Row():
-                gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
-
-            with gr.Row():
-                with gr.Column():
-                    model_name_textbox = gr.Textbox(label="Model name")
-                    revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
-                    model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
-                        label="Model type",
-                        multiselect=False,
-                        value=None,
-                        interactive=True,
-                    )
-
-                with gr.Column():
-                    precision = gr.Dropdown(
-                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
-                        label="Precision",
-                        multiselect=False,
-                        value="float16",
-                        interactive=True,
-                    )
-                    weight_type = gr.Dropdown(
-                        choices=[i.value.name for i in WeightType],
-                        label="Weights type",
-                        multiselect=False,
-                        value="Original",
-                        interactive=True,
-                    )
-                    base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
-
-            submit_button = gr.Button("Submit Eval")
-            submission_result = gr.Markdown()
-            submit_button.click(
-                add_new_eval,
-                [
-                    model_name_textbox,
-                    base_model_name_textbox,
-                    revision_name_textbox,
-                    precision,
-                    weight_type,
-                    model_type,
-                ],
-                submission_result,
-            )
+    with gr.Tabs(elem_classes="tab-buttons") as tabs:  # only one tabitem left
+        with gr.TabItem("Leaderboard"):
+            leaderboard = init_leaderboard(LEADERBOARD_DF)

     with gr.Row():
         with gr.Accordion("📙 Citation", open=False):
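
Read together, the new app.py reduces to a short pipeline: mirror the results dataset, build one DataFrame, and render a single leaderboard tab. The sketch below is illustrative only — the Blocks/scheduler boilerplate at the bottom of app.py is not part of this diff, and the restart interval and the bare `Leaderboard(value=...)` call are assumptions.

```python
# Minimal sketch of the simplified flow after this commit. Helpers and constants
# are the ones visible in the diff; the scheduler interval and the bare
# Leaderboard(...) call are assumptions, not the committed code.
import gradio as gr
from gradio_leaderboard import Leaderboard
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.display.utils import COLS
from src.envs import API, EVAL_RESULTS_PATH, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_leaderboard_df


def restart_space():
    API.restart_space(repo_id=REPO_ID)

# 1) Mirror the results dataset locally; restart the Space if the download fails.
try:
    snapshot_download(
        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset",
        tqdm_class=None, etag_timeout=30, token=TOKEN,
    )
except Exception:
    restart_space()

# 2) Build the leaderboard DataFrame straight from the results (no queue/request repos).
LEADERBOARD_DF = get_leaderboard_df(EVAL_RESULTS_PATH, None, COLS, [])

# 3) Render a single leaderboard tab.
demo = gr.Blocks()
with demo:
    with gr.Tabs(elem_classes="tab-buttons"):
        with gr.TabItem("Leaderboard"):
            leaderboard = Leaderboard(value=LEADERBOARD_DF)

# 4) Restart periodically so newly uploaded results are picked up (interval assumed).
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=1800)
scheduler.start()
demo.launch()
```
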
src/about.py CHANGED
@@ -32,45 +32,6 @@ INTRODUCTION_TEXT = """
 Intro text
 """

-# Which evaluations are you running? how can people reproduce what you have?
-LLM_BENCHMARKS_TEXT = f"""
-## How it works
-
-## Reproducibility
-To reproduce our results, here is the commands you can run:
-
-"""
-
-EVALUATION_QUEUE_TEXT = """
-## Some good practices before submitting a model
-
-### 1) Make sure you can load your model and tokenizer using AutoClasses:
-```python
-from transformers import AutoConfig, AutoModel, AutoTokenizer
-config = AutoConfig.from_pretrained("your model name", revision=revision)
-model = AutoModel.from_pretrained("your model name", revision=revision)
-tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
-```
-If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
-
-Note: make sure your model is public!
-Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
-
-### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
-It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
-
-### 3) Make sure your model has an open license!
-This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
-
-### 4) Fill up your model card
-When we add extra information about models to the leaderboard, it will be automatically taken from the model card
-
-## In case of model failure
-If your model is displayed in the `FAILED` category, its execution stopped.
-Make sure you have followed the above steps first.
-If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
-"""
-
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
 CITATION_BUTTON_TEXT = r"""
 """
src/display/utils.py CHANGED
@@ -1,17 +1,13 @@
 from dataclasses import dataclass, field, make_dataclass
 from enum import Enum

-import pandas as pd
-
-from src.about import Tasks
+""" adapted from original template, where unnecessary code was removed
+util.py is used for defining our fixed columns, which will be referenced to from app.py
+ColumnContent dataclass used to define column properties"""

 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

-
-# These classes are for user facing column names,
-# to avoid having to change them all around the code
-# when a modif is needed
 @dataclass
 class ColumnContent:
     name: str
@@ -92,8 +88,5 @@ class Precision(Enum):
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]

-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+BENCHMARK_COLS = []
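
The added module docstring says utils.py now only defines the fixed display columns that app.py reads. A rough illustration of that mechanism follows: the `fields()` helper just walks class attributes, so `COLS` ends up as the list of visible column names. Only `technique` is taken from this commit; `some_metric` is a placeholder, and the real file may still build `AutoEvalColumn` dynamically with `make_dataclass` as in the template.

```python
# Illustration only: how ColumnContent-style declarations become the COLS list
# that app.py consumes. Column names other than "technique" are placeholders.
from dataclasses import dataclass


@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


class AutoEvalColumn:  # one class attribute per leaderboard column
    technique = ColumnContent("technique", "str", True, never_hidden=True)
    some_metric = ColumnContent("some_metric", "number", True)


def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
print(COLS)  # ['technique', 'some_metric']
```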
 
src/envs.py CHANGED
@@ -1,25 +1,21 @@
 import os

 from huggingface_hub import HfApi
-
+""" adapted from original template, removed unnecessary code """
 # Info to change for your repository
 # ----------------------------------
 TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org

-OWNER = "ongks1999" # Change to your org - don't forget to create a results and request dataset, with the correct format!
+OWNER = "Unlearningltd" # Change to your org - don't forget to create a results and request dataset, with the correct format!
 # ----------------------------------

 REPO_ID = f"{OWNER}/leaderboard"
-QUEUE_REPO = f"{OWNER}/requests"
 RESULTS_REPO = f"{OWNER}/results"

 # If you setup a cache later, just change HF_HOME
 CACHE_PATH=os.getenv("HF_HOME", ".")

 # Local caches
-EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
 EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
-EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
-EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

 API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py CHANGED
@@ -5,6 +5,13 @@ from dataclasses import dataclass, field
 from src.display.utils import AutoEvalColumn
 from src.about import Tasks

+for file in files:  # each json file has its own row in the data frame
+    with open(file, 'r') as file_json:
+        data = json.load(file_json)
+        row = {"technique": data.get("technique_name", None)}  # metric result is a nested dict
+        for eval_method, result in data.get("metric_results", {}).items():  # used .get() to prevent KeyError
+            row[eval_method] = result.get('value')  # multiple eval results under metric results
+    data_rows.append(row)

 @dataclass
 class EvalResult:
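
The new loop refers to `files`, `data_rows`, and `json`, which sit outside this hunk. Below is a hedged sketch of how it could be wired up end to end; the glob pattern and the wrapping function are assumptions, and only the JSON layout (`technique_name` plus a nested `metric_results` → `value` mapping) comes from the diff.

```python
# Hedged sketch: one way the committed loop could live inside a helper with the
# imports it needs. File discovery and the function name are assumptions.
import glob
import json
import os

import pandas as pd


def read_result_files(results_path: str) -> pd.DataFrame:
    files = glob.glob(os.path.join(results_path, "**", "*.json"), recursive=True)
    data_rows = []
    for file in files:  # each json file becomes one row of the data frame
        with open(file, "r") as file_json:
            data = json.load(file_json)
        row = {"technique": data.get("technique_name", None)}
        for eval_method, result in data.get("metric_results", {}).items():
            row[eval_method] = result.get("value")  # one column per eval method
        data_rows.append(row)
    return pd.DataFrame(data_rows)
```
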
src/populate.py CHANGED
@@ -1,12 +1,7 @@
-import json
-import os
-
-import pandas as pd
-
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
 from src.leaderboard.read_evals import get_raw_eval_results
+import pandas as pd

+""" calls get_raw_eval_results function from our read_evals.py file to get the DataFrame"""

 def get_leaderboard_df(results_path: str, requests_path: str = None, cols: list = None, benchmark_cols: list = None) -> pd.DataFrame:
     """Creates a dataframe from all the individual experiment results"""