pminervini committed on
Commit 018441b
1 Parent(s): b2cd23e
app.py CHANGED
@@ -22,40 +22,51 @@ from src.display.utils import (
     AutoEvalColumn,
     ModelType,
     fields,
+    WeightType,
+    Precision
 )
 from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
 from src.populate import get_evaluation_queue_df, get_leaderboard_df
 from src.submission.submit import add_new_eval
-# from src.submission.check_validity import already_submitted_models
-# from src.tools.collections import update_collections
+from src.submission.check_validity import already_submitted_models
+from src.tools.collections import update_collections
 from src.tools.plots import (
     create_metric_plot_obj,
     create_plot_df,
     create_scores_df,
 )
 
+
 def restart_space():
     API.restart_space(repo_id=REPO_ID, token=H4_TOKEN)
 
 try:
     print(EVAL_REQUESTS_PATH)
-    snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+    snapshot_download(
+        repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
 except Exception:
     restart_space()
 try:
     print(EVAL_RESULTS_PATH)
-    snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+    snapshot_download(
+        repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30
+    )
 except Exception:
     restart_space()
 
 
 raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
-# update_collections(original_df.copy())
+update_collections(original_df.copy())
 leaderboard_df = original_df.copy()
 
 plot_df = create_plot_df(create_scores_df(raw_data))
 
-(finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
+(
+    finished_eval_queue_df,
+    running_eval_queue_df,
+    pending_eval_queue_df,
+) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
 
 
 # Searching and filtering
@@ -177,8 +188,8 @@ with demo:
             )
             filter_columns_precision = gr.CheckboxGroup(
                 label="Precision",
-                choices=["torch.float16", "torch.bfloat16", "torch.float32", "8bit", "4bit", "GPTQ"],
-                value=["torch.float16", "torch.bfloat16", "torch.float32", "8bit", "4bit", "GPTQ"],
+                choices=[i.value.name for i in Precision],
+                value=[i.value.name for i in Precision],
                 interactive=True,
                 elem_id="filter-columns-precision",
             )
@@ -308,7 +319,7 @@ with demo:
                     revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
                     private = gr.Checkbox(False, label="Private", visible=not IS_PUBLIC)
                     model_type = gr.Dropdown(
-                        choices=[t.to_str(" : ") for t in ModelType],
+                        choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown],
                         label="Model type",
                         multiselect=False,
                         value=None,
@@ -317,14 +328,14 @@ with demo:
 
                 with gr.Column():
                     precision = gr.Dropdown(
-                        choices=["float16", "bfloat16", "8bit (LLM.int8)", "4bit (QLoRA / FP4)", "GPTQ"],
+                        choices=[i.value.name for i in Precision if i != Precision.Unknown],
                         label="Precision",
                         multiselect=False,
                         value="float16",
                         interactive=True,
                     )
                     weight_type = gr.Dropdown(
-                        choices=["Original", "Delta", "Adapter"],
+                        choices=[i.value.name for i in WeightType],
                         label="Weights type",
                         multiselect=False,
                         value="Original",
beta-cli.py ADDED
@@ -0,0 +1,16 @@
+#!/usr/bin/env python
+
+from huggingface_hub import snapshot_download
+from src.leaderboard.read_evals import get_raw_eval_results
+from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, H4_TOKEN, IS_PUBLIC, QUEUE_REPO, REPO_ID, RESULTS_REPO
+
+snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30)
+
+raw_data = get_raw_eval_results(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH)
+
+for entry in raw_data:
+    if '125' in entry.eval_name:
+        print(entry)
+
+# print(raw_data)
requirements.txt CHANGED
@@ -17,6 +17,5 @@ python-dateutil==2.8.2
 requests==2.28.2
 semantic-version==2.10.0
 tqdm==4.65.0
-git+https://github.com/clefourrier/transformers.git@req-fix#egg=transformers
-#transformers==4.35.1
+transformers==4.35.2
 tokenizers>=0.15.0
src/display/formatting.py CHANGED
@@ -7,23 +7,6 @@ from huggingface_hub.hf_api import ModelInfo
 
 API = HfApi()
 
-LLAMAS = [
-    "huggingface/llama-7b",
-    "huggingface/llama-13b",
-    "huggingface/llama-30b",
-    "huggingface/llama-65b",
-]
-
-KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
-VICUNA_LINK = "https://huggingface.co/lmsys/vicuna-13b-delta-v1.1"
-OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
-DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
-MODEL_PAGE = "https://huggingface.co/models"
-LLAMA_LINK = "https://ai.facebook.com/blog/large-language-model-llama-meta-ai/"
-VICUNA_LINK = "https://huggingface.co/CarperAI/stable-vicuna-13b-delta"
-ALPACA_LINK = "https://crfm.stanford.edu/2023/03/13/alpaca.html"
-
-
 def model_hyperlink(link, model_name):
     return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
@@ -31,44 +14,9 @@ def model_hyperlink(link, model_name):
 def make_clickable_model(model_name):
     link = f"https://huggingface.co/{model_name}"
 
-    if model_name in LLAMAS:
-        link = LLAMA_LINK
-        model_name = model_name.split("/")[1]
-    elif model_name == "HuggingFaceH4/stable-vicuna-13b-2904":
-        link = VICUNA_LINK
-        model_name = "stable-vicuna-13b"
-    elif model_name == "HuggingFaceH4/llama-7b-ift-alpaca":
-        link = ALPACA_LINK
-        model_name = "alpaca-13b"
-    if model_name == "dolly-12b":
-        link = DOLLY_LINK
-    elif model_name == "vicuna-13b":
-        link = VICUNA_LINK
-    elif model_name == "koala-13b":
-        link = KOALA_LINK
-    elif model_name == "oasst-12b":
-        link = OASST_LINK
-
     details_model_name = model_name.replace("/", "__")
     details_link = f"https://huggingface.co/datasets/open-llm-leaderboard/details_{details_model_name}"
 
-    if not bool(os.getenv("DEBUG", "False")):
-        # We only add these checks when not debugging, as they are extremely slow
-        print(f"details_link: {details_link}")
-        try:
-            check_path = list(
-                API.list_files_info(
-                    repo_id=f"open-llm-leaderboard/details_{details_model_name}",
-                    paths="README.md",
-                    repo_type="dataset",
-                )
-            )
-            print(f"check_path: {check_path}")
-        except Exception as err:
-            # No details repo for this model
-            print(f"No details repo for this model: {err}")
-            return model_hyperlink(link, model_name)
-
     return model_hyperlink(link, model_name) + " " + model_hyperlink(details_link, "📑")
 
 
src/display/utils.py CHANGED
@@ -1,8 +1,26 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, make_dataclass
 from enum import Enum
 
 import pandas as pd
 
+def fields(raw_class):
+    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
+
+
+@dataclass
+class Task:
+    benchmark: str
+    metric: str
+    col_name: str
+
+class Tasks(Enum):
+    arc = Task("arc:challenge", "acc_norm", "ARC")
+    hellaswag = Task("hellaswag", "acc_norm", "HellaSwag")
+    mmlu = Task("hendrycksTest", "acc", "MMLU")
+    truthfulqa = Task("truthfulqa:mc", "mc2", "TruthfulQA")
+    winogrande = Task("winogrande", "acc", "Winogrande")
+    gsm8k = Task("gsm8k", "acc", "GSM8K")
+    drop = Task("drop", "f1", "DROP")
 
 # These classes are for user facing column names,
 # to avoid having to change them all around the code
@@ -16,39 +34,29 @@ class ColumnContent:
     never_hidden: bool = False
     dummy: bool = False
 
-
-def fields(raw_class):
-    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
-
-
-@dataclass(frozen=True)
-class AutoEvalColumn:  # Auto evals column
-    model_type_symbol = ColumnContent("T", "str", True, never_hidden=True)
-    model = ColumnContent("Model", "markdown", True, never_hidden=True)
-    average = ColumnContent("Average ⬆️", "number", True)
-
-    arc = ColumnContent("ARC", "number", True)
-    hellaswag = ColumnContent("HellaSwag", "number", True)
-    mmlu = ColumnContent("MMLU", "number", True)
-    truthfulqa = ColumnContent("TruthfulQA", "number", True)
-    winogrande = ColumnContent("Winogrande", "number", True)
-    gsm8k = ColumnContent("GSM8K", "number", True)
-    drop = ColumnContent("DROP", "number", True)
-    nqopen = ColumnContent("NQ Open", "number", True)
-
-    model_type = ColumnContent("Type", "str", False)
-    architecture = ColumnContent("Architecture", "str", False)
-    weight_type = ColumnContent("Weight type", "str", False, True)
-    precision = ColumnContent("Precision", "str", False)  # , True)
-    license = ColumnContent("Hub License", "str", False)
-    params = ColumnContent("#Params (B)", "number", False)
-    likes = ColumnContent("Hub ❤️", "number", False)
-    still_on_hub = ColumnContent("Available on the hub", "bool", False)
-    revision = ColumnContent("Model sha", "str", False, False)
-    dummy = ColumnContent(
-        "model_name_for_query", "str", False, dummy=True
-    )  # dummy col to implement search bar (hidden by custom CSS)
-
+auto_eval_column_dict = []
+# Init
+auto_eval_column_dict.append(["model_type_symbol", ColumnContent, ColumnContent("T", "str", True, never_hidden=True)])
+auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
+# Scores
+auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
+for task in Tasks:
+    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
+# Model information
+auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
+auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
+auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
+auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
+auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
+auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
+auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
+auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
+# Dummy column for the search bar (hidden by the custom CSS)
+auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+
+# We use make_dataclass to dynamically fill the scores from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
 @dataclass(frozen=True)
 class EvalQueueColumn:  # Queue column
@@ -102,17 +110,17 @@ human_baseline_row = {
 }
 
 @dataclass
-class ModelTypeDetails:
+class ModelDetails:
     name: str
-    symbol: str  # emoji
+    symbol: str = ""  # emoji, only for the model type
 
 
 class ModelType(Enum):
-    PT = ModelTypeDetails(name="pretrained", symbol="🟢")
-    FT = ModelTypeDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelTypeDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelTypeDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelTypeDetails(name="", symbol="?")
+    PT = ModelDetails(name="pretrained", symbol="🟢")
+    FT = ModelDetails(name="fine-tuned", symbol="🔶")
+    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
+    RL = ModelDetails(name="RL-tuned", symbol="🟦")
+    Unknown = ModelDetails(name="", symbol="?")
 
     def to_str(self, separator=" "):
         return f"{self.value.symbol}{separator}{self.value.name}"
@@ -129,23 +137,33 @@ class ModelType(Enum):
             return ModelType.IFT
         return ModelType.Unknown
 
 
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-class Tasks(Enum):
-    arc = Task("arc:challenge", "acc_norm", AutoEvalColumn.arc.name)
-    hellaswag = Task("hellaswag", "acc_norm", AutoEvalColumn.hellaswag.name)
-    mmlu = Task("hendrycksTest", "acc", AutoEvalColumn.mmlu.name)
-    truthfulqa = Task("truthfulqa:mc", "mc2", AutoEvalColumn.truthfulqa.name)
-    winogrande = Task("winogrande", "acc", AutoEvalColumn.winogrande.name)
-    gsm8k = Task("gsm8k", "acc", AutoEvalColumn.gsm8k.name)
-    drop = Task("drop", "f1", AutoEvalColumn.drop.name)
-    nq = Task("nqopen", "em", AutoEvalColumn.nqopen.name)
+class WeightType(Enum):
+    Adapter = ModelDetails("Adapter")
+    Original = ModelDetails("Original")
+    Delta = ModelDetails("Delta")
+
+class Precision(Enum):
+    float16 = ModelDetails("float16")
+    bfloat16 = ModelDetails("bfloat16")
+    qt_8bit = ModelDetails("8bit")
+    qt_4bit = ModelDetails("4bit")
+    qt_GPTQ = ModelDetails("GPTQ")
+    Unknown = ModelDetails("?")
+
+    def from_str(precision):
+        if precision in ["torch.float16", "float16"]:
+            return Precision.float16
+        if precision in ["torch.bfloat16", "bfloat16"]:
+            return Precision.bfloat16
+        if precision in ["8bit"]:
+            return Precision.qt_8bit
+        if precision in ["4bit"]:
+            return Precision.qt_4bit
+        if precision in ["GPTQ", "None"]:
+            return Precision.qt_GPTQ
+        return Precision.Unknown
+
 
 
 # Column selection
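A quick sketch of what the new enum-driven definitions evaluate to once the updated module is imported (a usage illustration only, assuming the repository root is on PYTHONPATH; the values follow from the code above):

    from src.display.utils import AutoEvalColumn, Precision

    # Score columns are generated from the Tasks enum via make_dataclass,
    # so adding a benchmark only requires a new Tasks member.
    print(AutoEvalColumn.arc.name)               # "ARC"
    print(AutoEvalColumn.gsm8k.name)             # "GSM8K"

    # Precision strings found in results files are normalised to enum members.
    print(Precision.from_str("torch.bfloat16"))  # Precision.bfloat16
    print(Precision.from_str("None"))            # Precision.qt_GPTQ
    print(Precision.from_str("float32"))         # Precision.Unknown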
src/leaderboard/read_evals.py CHANGED
@@ -5,12 +5,12 @@ import os
 from dataclasses import dataclass
 
 import dateutil
-# from datetime import datetime
-# from transformers import AutoConfig
+from datetime import datetime
+from transformers import AutoConfig
 import numpy as np
 
 from src.display.formatting import make_clickable_model
-from src.display.utils import AutoEvalColumn, ModelType, Tasks
+from src.display.utils import AutoEvalColumn, ModelType, Tasks, Precision, WeightType
 from src.submission.check_validity import is_model_on_hub
 
 
@@ -23,9 +23,9 @@ class EvalResult:
     model: str
     revision: str  # commit hash, "" if main
     results: dict
-    precision: str = ""
+    precision: Precision = Precision.Unknown
     model_type: ModelType = ModelType.Unknown  # Pretrained, fine tuned, ...
-    weight_type: str = "Original"  # Original or Adapter
+    weight_type: WeightType = WeightType.Original  # Original or Adapter
     architecture: str = "Unknown"  # From config file
     license: str = "?"
     likes: int = 0
@@ -43,9 +43,7 @@ class EvalResult:
         config = data.get("config", data.get("config_general", None))
 
         # Precision
-        precision = config.get("model_dtype")
-        if precision == "None":
-            precision = "GPTQ"
+        precision = Precision.from_str(config.get("model_dtype"))
 
         # Get model and org
         org_and_model = config.get("model_name", config.get("model_args", None))
@@ -54,15 +52,15 @@ class EvalResult:
         if len(org_and_model) == 1:
             org = None
             model = org_and_model[0]
-            result_key = f"{model}_{precision}"
+            result_key = f"{model}_{precision.value.name}"
         else:
            org = org_and_model[0]
            model = org_and_model[1]
-            result_key = f"{org}_{model}_{precision}"
+            result_key = f"{org}_{model}_{precision.value.name}"
         full_model = "/".join(org_and_model)
 
         still_on_hub, error, model_config = is_model_on_hub(
-            full_model, config.get("model_sha", "main"), trust_remote_code=True
+            full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
         )
         architecture = "?"
         if model_config is not None:
@@ -112,13 +110,13 @@
 
     def update_with_request_file(self, requests_path):
         """Finds the relevant request file for the current model and updates info with it"""
-        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision)
+        request_file = get_request_file_for_model(requests_path, self.full_model, self.precision.value.name)
 
         try:
             with open(request_file, "r") as f:
                 request = json.load(f)
             self.model_type = ModelType.from_str(request.get("model_type", ""))
-            self.weight_type = request.get("weight_type", "?")
+            self.weight_type = WeightType[request.get("weight_type", "Original")]
             self.license = request.get("license", "?")
             self.likes = request.get("likes", 0)
             self.num_params = request.get("params", 0)
@@ -131,10 +129,10 @@
         average = sum([v for v in self.results.values() if v is not None]) / len(Tasks)
         data_dict = {
             "eval_name": self.eval_name,  # not a column, just a save name,
-            AutoEvalColumn.precision.name: self.precision,
+            AutoEvalColumn.precision.name: self.precision.value.name,
             AutoEvalColumn.model_type.name: self.model_type.value.name,
             AutoEvalColumn.model_type_symbol.name: self.model_type.value.symbol,
-            AutoEvalColumn.weight_type.name: self.weight_type,
+            AutoEvalColumn.weight_type.name: self.weight_type.value.name,
             AutoEvalColumn.architecture.name: self.architecture,
             AutoEvalColumn.model.name: make_clickable_model(self.full_model),
             AutoEvalColumn.dummy.name: self.full_model,
@@ -167,7 +165,7 @@ def get_request_file_for_model(requests_path, model_name, precision):
         with open(tmp_request_file, "r") as f:
             req_content = json.load(f)
             if (
-                req_content["status"] in ["FINISHED", "PENDING_NEW_EVAL"]
+                req_content["status"] in ["FINISHED"]
                 and req_content["precision"] == precision.split(".")[-1]
             ):
                 request_file = tmp_request_file
src/populate.py CHANGED
@@ -21,13 +21,7 @@ def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchm
 
     df = pd.DataFrame.from_records(all_data_json)
     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
-
-    # df = df[cols].round(decimals=2)
-    for col in cols:
-        if col in df.columns:
-            df[col] = df[col].round(decimals=2)
-        else:
-            df[col] = 0.0
+    df = df[cols].round(decimals=2)
 
     # filter out if any of the benchmarks have not been produced
     df = df[has_no_nan_values(df, benchmark_cols)]
src/submission/check_validity.py CHANGED
@@ -8,6 +8,7 @@ import huggingface_hub
 from huggingface_hub import ModelCard
 from huggingface_hub.hf_api import ModelInfo
 from transformers import AutoConfig
+from transformers.models.auto.tokenization_auto import tokenizer_class_from_name, get_tokenizer_config
 
 from src.envs import HAS_HIGHER_RATE_LIMIT
 
@@ -36,9 +37,24 @@ def check_model_card(repo_id: str) -> tuple[bool, str]:
     return True, ""
 
 
-def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False) -> tuple[bool, str]:
+def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
     try:
         config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
+        if test_tokenizer:
+            tokenizer_config = get_tokenizer_config(model_name)
+            if tokenizer_config is not None:
+                tokenizer_class_candidate = tokenizer_config.get("tokenizer_class", None)
+            else:
+                tokenizer_class_candidate = config.tokenizer_class
+
+
+            tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate)
+            if tokenizer_class is None:
+                return (
+                    False,
+                    f"uses {tokenizer_class_candidate}, which is not in a transformers release, therefore not supported at the moment.",
+                    None
+                )
         return True, None, config
 
     except ValueError:
@@ -48,7 +64,7 @@ def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_rem
             None
         )
 
-    except Exception:
+    except Exception as e:
        return False, "was not found on hub!", None
 
 
@@ -71,8 +87,7 @@ def get_model_size(model_info: ModelInfo, precision: str):
 def get_model_arch(model_info: ModelInfo):
     return model_info.config.get("architectures", "Unknown")
 
-def user_submission_permission(submission_name, users_to_submission_dates, rate_limit_period, rate_limit_quota):
-    org_or_user, _ = submission_name.split("/")
+def user_submission_permission(org_or_user, users_to_submission_dates, rate_limit_period, rate_limit_quota):
     if org_or_user not in users_to_submission_dates:
         return True, ""
     submission_dates = sorted(users_to_submission_dates[org_or_user])
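A minimal usage sketch of the extended hub check (the model id here is only an example; it assumes network access and the transformers release pinned in requirements.txt):

    from src.submission.check_validity import is_model_on_hub

    # With test_tokenizer=True the check also verifies that the tokenizer class
    # declared by the repository exists in the installed transformers release.
    on_hub, error, config = is_model_on_hub(
        model_name="gpt2", revision="main", trust_remote_code=False, test_tokenizer=True
    )
    if not on_hub:
        print(f'Model "gpt2" {error}')
    else:
        print(config.architectures)  # ["GPT2LMHeadModel"] for this example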
src/submission/submit.py CHANGED
@@ -30,6 +30,11 @@ def add_new_eval(
     if not REQUESTED_MODELS:
         REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
 
+    user_name = ""
+    model_path = model
+    if "/" in model:
+        user_name = model.split("/")[0]
+        model_path = model.split("/")[1]
 
     precision = precision.split(" ")[0]
     current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
@@ -38,11 +43,12 @@
         return styled_error("Please select a model type.")
 
     # Is the user rate limited?
-    user_can_submit, error_msg = user_submission_permission(
-        model, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
-    )
-    if not user_can_submit:
-        return styled_error(error_msg)
+    if user_name != "":
+        user_can_submit, error_msg = user_submission_permission(
+            user_name, USERS_TO_SUBMISSION_DATES, RATE_LIMIT_PERIOD, RATE_LIMIT_QUOTA
+        )
+        if not user_can_submit:
+            return styled_error(error_msg)
 
     # Did the model authors forbid its submission to the leaderboard?
     if model in DO_NOT_SUBMIT_MODELS or base_model in DO_NOT_SUBMIT_MODELS:
@@ -54,12 +60,12 @@
 
     # Is the model on the hub?
     if weight_type in ["Delta", "Adapter"]:
-        base_model_on_hub, error, _ = is_model_on_hub(base_model, revision, H4_TOKEN)
+        base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=H4_TOKEN, test_tokenizer=True)
         if not base_model_on_hub:
            return styled_error(f'Base model "{base_model}" {error}')
 
     if not weight_type == "Adapter":
-        model_on_hub, error, _ = is_model_on_hub(model, revision)
+        model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, test_tokenizer=True)
         if not model_on_hub:
             return styled_error(f'Model "{model}" {error}')
 
@@ -99,12 +105,6 @@
         "license": license,
     }
 
-    user_name = ""
-    model_path = model
-    if "/" in model:
-        user_name = model.split("/")[0]
-        model_path = model.split("/")[1]
-
     # Check for duplicate submission
     if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
         return styled_warning("This model has been already submitted.")
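For reference, a small sketch of how the reworked rate-limit gate behaves (the model id and quota values are illustrative only; in add_new_eval they come from the submission form and src.envs):

    from src.submission.check_validity import user_submission_permission

    model = "my-org/my-model"            # hypothetical submission id
    user_name, model_path = "", model
    if "/" in model:
        user_name = model.split("/")[0]
        model_path = model.split("/")[1]

    # Only submissions with an org/user prefix are rate-limited, keyed by that prefix.
    if user_name != "":
        can_submit, error_msg = user_submission_permission(
            user_name, users_to_submission_dates={}, rate_limit_period=7, rate_limit_quota=5
        )
        print(can_submit, error_msg)     # True "" when the user has no prior submissions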