jinsol-neubla committed
Commit 8cc8a87
1 Parent(s): 73dcc35

Add FP8 and fake_quant filter


Signed-off-by: jinsol-neubla <jinsol.kim@neubla.com>

Files changed (3)
  1. app.py +20 -3
  2. src/display/utils.py +17 -0
  3. src/leaderboard/read_evals.py +24 -15
app.py CHANGED

@@ -3,7 +3,7 @@ import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
 from huggingface_hub import snapshot_download
-from gradio_space_ci import enable_space_ci
+# from gradio_space_ci import enable_space_ci
 
 from src.display.about import (
     INTRODUCTION_TEXT,
@@ -25,6 +25,7 @@ from src.display.utils import (
     fields,
     WeightType,
     Precision,
+    Format
 )
 from src.envs import API, EVAL_RESULTS_PATH, RESULTS_REPO, REPO_ID, HF_TOKEN
 from src.populate import get_leaderboard_df
@@ -84,6 +85,7 @@ def update_table(
     activation_precision_query: str,
     size_query: list,
     hide_models: list,
+    format_query: list,
     query: str,
 ):
     filtered_df = filter_models(
@@ -93,6 +95,7 @@ def update_table(
         weight_precision_query=weight_precision_query,
         activation_precision_query=activation_precision_query,
         hide_models=hide_models,
+        format_query=format_query,
     )
     filtered_df = filter_queries(query, filtered_df)
     df = select_columns(filtered_df, columns)
@@ -153,6 +156,7 @@ def filter_models(
     weight_precision_query: list,
     activation_precision_query: list,
     hide_models: list,
+    format_query: list,
 ) -> pd.DataFrame:
     # Show all models
     if "Private or deleted" in hide_models:
@@ -175,6 +179,7 @@ def filter_models(
     filtered_df = filtered_df.loc[
         df[AutoEvalColumn.activation_precision.name].isin(activation_precision_query + ["None"])
     ]
+    filtered_df = filtered_df.loc[df[AutoEvalColumn.format.name].isin(format_query)]
 
     numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
@@ -191,6 +196,7 @@ leaderboard_df = filter_models(
     weight_precision_query=[i.value.name for i in Precision],
     activation_precision_query=[i.value.name for i in Precision],
     hide_models=["Private or deleted", "Contains a merge/moerge", "Flagged"], # Deleted, merges, flagged, MoEs
+    format_query=[i.value.name for i in Format],
 )
 
 demo = gr.Blocks(css=custom_css)
@@ -227,7 +233,7 @@ with demo:
             with gr.Row():
                 hide_models = gr.CheckboxGroup(
                     label="Hide models",
-                    choices=["Private or deleted", "Contains a merge/moerge", "Flagged", "MoE"],
+                    choices=["Private or deleted", "Contains a merge/moerge", "Flagged"], #, "MoE"],
                     value=["Private or deleted", "Contains a merge/moerge", "Flagged"],
                     interactive=True,
                 )
@@ -261,6 +267,13 @@ with demo:
                     interactive=True,
                     elem_id="filter-columns-size",
                 )
+                filter_format = gr.CheckboxGroup(
+                    label="Format",
+                    choices=[i.value.name for i in Format],
+                    value=[i.value.name for i in Format],
+                    interactive=True,
+                    elem_id="filter-format",
+                )
 
         leaderboard_table = gr.components.Dataframe(
             value=leaderboard_df[
@@ -293,6 +306,7 @@ with demo:
             filter_columns_activation_precision,
             filter_columns_size,
             hide_models,
+            filter_format,
             search_bar,
         ],
         leaderboard_table,
@@ -310,6 +324,7 @@ with demo:
             filter_columns_activation_precision,
             filter_columns_size,
             hide_models,
+            filter_format,
             search_bar,
         ],
         leaderboard_table,
@@ -324,6 +339,7 @@ with demo:
         filter_columns_activation_precision,
         filter_columns_size,
         hide_models,
+        filter_format,
     ]:
         selector.change(
             update_table,
@@ -335,6 +351,7 @@ with demo:
             filter_columns_activation_precision,
             filter_columns_size,
             hide_models,
+            filter_format,
             search_bar,
         ],
         leaderboard_table,
@@ -374,4 +391,4 @@ scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=1800) # restarted every 3h
 scheduler.start()
 
-demo.queue(default_concurrency_limit=40).launch()
+demo.queue(default_concurrency_limit=40).launch(share=True)
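The net effect in app.py: a new format_query flows from a "Format" checkbox group through update_table into filter_models, exactly parallel to the existing precision filters. A minimal, self-contained sketch of the added filtering step follows; the toy DataFrame is invented for illustration, and in the Space the column name comes from AutoEvalColumn.format.name (wired up in src/display/utils.py below).

# Standalone sketch of the Format filter added above (illustrative data only).
import pandas as pd

df = pd.DataFrame({
    "model_name_for_query": ["org/model-a", "org/model-b"],
    "Format": ["FAKE_QUANT", "None"],
})

format_query = ["FAKE_QUANT"]
# Mirrors: filtered_df = filtered_df.loc[df[AutoEvalColumn.format.name].isin(format_query)]
filtered_df = df.loc[df["Format"].isin(format_query)]
print(filtered_df)  # keeps only the FAKE_QUANT row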
src/display/utils.py CHANGED

@@ -66,6 +66,7 @@ auto_eval_column_dict.append(["flagged", ColumnContent, ColumnContent("Flagged",
 auto_eval_column_dict.append(["moe", ColumnContent, ColumnContent("MoE", "bool", False, hidden=True)])
 # Dummy column for the search bar (hidden by the custom CSS)
 auto_eval_column_dict.append(["dummy", ColumnContent, ColumnContent("model_name_for_query", "str", False, dummy=True)])
+auto_eval_column_dict.append(["format", ColumnContent, ColumnContent("Format", "str", False)])
 
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
@@ -166,7 +167,9 @@ class Precision(Enum):
     float32 = ModelDetails("float32")
     float16 = ModelDetails("float16")
     bfloat16 = ModelDetails("bfloat16")
+    int8 = ModelDetails("int8")
     int4 = ModelDetails("int4")
+    float8 = ModelDetails("float8")
     Unknown = ModelDetails("?")
 
     def from_str(precision):
@@ -174,11 +177,25 @@ class Precision(Enum):
             return Precision.float16
         if precision in ["torch.bfloat16", "bfloat16"]:
             return Precision.bfloat16
+        if precision in ["int8"]:
+            return Precision.int8
         if precision in ["int4"]:
             return Precision.int4
+        if precision in ["float8", "fp8"]:
+            return Precision.float8
         if precision in ["torch.float32", "float32"]:
             return Precision.float32
         return Precision.Unknown
+
+
+class Format(Enum):
+    FakeQuant = ModelDetails("FAKE_QUANT")
+    Unknown = ModelDetails("None")
+
+    def from_str(format):
+        if format in ["FAKE_QUANT"]:
+            return Format.FakeQuant
+        return Format.Unknown
 
 
 # Column selection
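How the widened parsers behave, as a sketch assuming the enums above are importable from the Space's source tree. Since from_str takes no self or cls, it is called as a plain function through the class, the same way the existing Precision.from_str call sites in read_evals.py use it.

# Hedged usage sketch; assumes this runs with the Space's src/ on the path.
from src.display.utils import Precision, Format

# New Precision members round-trip from their string spellings.
assert Precision.from_str("int8") is Precision.int8
assert Precision.from_str("fp8") is Precision.float8  # "float8" maps the same way
assert Precision.from_str("torch.float64") is Precision.Unknown  # unlisted strings fall through

# Format recognizes only FAKE_QUANT; everything else is Unknown ("None").
assert Format.from_str("FAKE_QUANT") is Format.FakeQuant
assert Format.from_str(None) is Format.Unknown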
src/leaderboard/read_evals.py CHANGED

@@ -36,6 +36,7 @@ class EvalResult:
     flagged: bool = False
     status: str = "FINISHED"
     tags: list = None
+    format: str = None
 
     @classmethod
     def init_from_json_file(self, json_filepath):
@@ -61,6 +62,8 @@ class EvalResult:
         weight_precision = Precision.from_str(config.get("weight_precision"))
         activation_precision = Precision.from_str(config.get("activation_precision"))
 
+        format = config.get("format", "None")
+
         # Get model and org
         org_and_model = config.get("model")
         org_and_model = org_and_model.split("/", 1)
@@ -78,25 +81,29 @@ class EvalResult:
         # Extract results available in this file (some results are split in several files)
         results = {}
         for task in Tasks:
-            task = task.value
-            # We skip old mmlu entries
-            # Some truthfulQA values are NaNs
-            if task.benchmark == "truthfulqa_mc2" and "truthfulqa_mc2|0" in data["results"]:
-                if math.isnan(float(data["results"]["truthfulqa_mc2|0"][task.metric])):
-                    results[task.benchmark] = 0.0
-                    continue
-
-            # We average all scores of a given metric (mostly for mmlu)
-            if task.benchmark == "mmlu":
-                accs = np.array([data["results"].get(task.benchmark).get(task.metric, None)])
-            else:
-                accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
-            if accs.size == 0 or any([acc is None for acc in accs]):
-                continue
-
-            mean_acc = np.mean(accs) * 100.0
-            results[task.benchmark] = mean_acc
+            try:
+                task = task.value
+                # We skip old mmlu entries
+                # Some truthfulQA values are NaNs
+                if task.benchmark == "truthfulqa_mc2" and "truthfulqa_mc2|0" in data["results"]:
+                    if math.isnan(float(data["results"]["truthfulqa_mc2|0"][task.metric])):
+                        results[task.benchmark] = 0.0
+                        continue
+
+                # We average all scores of a given metric (mostly for mmlu)
+                if task.benchmark == "mmlu":
+                    accs = np.array([data["results"].get(task.benchmark, {}).get(task.metric, None)])
+                else:
+                    accs = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark in k])
+                if accs.size == 0 or any([acc is None for acc in accs]):
+                    continue
+
+                mean_acc = np.mean(accs) * 100.0
+                results[task.benchmark] = mean_acc
+            except Exception as e:
+                print(e)
+                continue
 
         return self(
             eval_name=result_key,
             full_model=full_model,
@@ -112,6 +119,7 @@ class EvalResult:
             date=date,
             architecture=architecture,
             tags=tags,
+            format=format,
         )
 
     # def update_with_request_file(self, requests_path):
@@ -160,6 +168,7 @@ class EvalResult:
             AutoEvalColumn.merged.name: "merge" in self.tags if self.tags else False,
             AutoEvalColumn.moe.name: ("moe" in self.tags if self.tags else False) or "moe" in self.full_model.lower(),
             AutoEvalColumn.flagged.name: self.flagged,
+            AutoEvalColumn.format.name: self.format,
         }
 
         for task in Tasks:
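End to end, the new field comes from the results JSON. Below is a hypothetical payload shaped after the keys init_from_json_file reads above; all concrete values are invented, the "acc" metric name is an assumption, and only the "format" key with its "None" fallback is what this commit adds.

# Hypothetical results payload; only the "format" key is new in this commit.
example = {
    "config": {
        "model": "org/quantized-model",  # invented model id
        "weight_precision": "int8",
        "activation_precision": "fp8",
        "format": "FAKE_QUANT",
    },
    "results": {
        "mmlu": {"acc": 0.42},  # metric name assumed for illustration
    },
}

config = example["config"]
fmt = config.get("format", "None")  # mirrors the parsing added above
print(fmt)  # -> FAKE_QUANT; a file without the key yields "None"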