Jae-Won Chung committed
Commit e9693d3
1 Parent(s): b5a071f

No composite metrics by default, add examples

Files changed (2)
  1. LEADERBOARD.md +0 -8
  2. app.py +27 -26
LEADERBOARD.md CHANGED
@@ -1,18 +1,10 @@
 The goal of the ML.ENERGY Leaderboard is to give people a sense of how much **energy** LLMs would consume.
 
-## How is energy different?
-
-The energy consumption of running inference depends on factors such as model architecture, size, and GPU model.
-However, even if we run models with the exact same architecture and size on the same GPU, the average energy consumption **per prompt** is different because different models have **different verbosity**.
-That is, when asked the same thing, different models answer in different lengths.
-
 ## Columns
 
 - `gpu`: NVIDIA GPU model name. Note that NLP evaluation was only run once on our A40 GPUs, so this column only changes system-level measurements like latency and energy.
 - `task`: Name of the task. See *Tasks* below for details.
-- `energy_eff`: Our definition of energy efficiency: Average NLP evaluation metric attained per Joule of energy (`nlp_average / energy`).
 - `energy` (J): The average energy consumed by the model to generate a response.
-- `nlp_average`: The arithmetic average of the NLP evaluation metrics we obtained. See *NLP evaluation metrics* below for details.
 - `throughput` (token/s): The average number of tokens generated per second.
 - `latency` (s): The average time it took for the model to generate a response.
 - `response_length` (token): The average number of tokens in the model's response.
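The two dropped columns were pure functions of the columns that remain, so they can still be reproduced by hand (or via the app's new custom-column formulas). A minimal pandas sketch; the NLP metric column names here are invented, the real ones come from score.csv:

import pandas as pd

# Invented example rows; real per-model scores live in score.csv.
df = pd.DataFrame({
    "model": ["model-a", "model-b"],
    "arc": [0.52, 0.47],          # hypothetical NLP metric column
    "hellaswag": [0.78, 0.71],    # hypothetical NLP metric column
    "energy": [512.0, 430.0],     # Joules per response
})

# nlp_average: arithmetic mean of the NLP evaluation metrics.
nlp_cols = [c for c in df.columns if c not in ("model", "energy")]
df["nlp_average"] = df[nlp_cols].mean(axis=1)

# energy_eff: average NLP metric attained per Joule (nlp_average / energy).
df["energy_eff"] = (df["nlp_average"] / df["energy"]).round(4)
print(df)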
 
app.py CHANGED
@@ -35,7 +35,7 @@ class TableManager:
         df["model"] = df["model"].apply(format_model_link)
 
         # Sort by our 'energy efficiency' score.
-        df = df.sort_values(by="energy_eff", ascending=False)
+        df = df.sort_values(by="energy", ascending=True)
 
         # The full table where all the data are.
         self.full_df = df
@@ -47,11 +47,6 @@ class TableManager:
         """Read tables."""
         df_score = pd.read_csv(f"{data_dir}/score.csv")
 
-        # Compute average NLP metrics
-        columns = df_score.columns.to_list()
-        columns.remove("model")
-        df_score["nlp_average"] = df_score[columns].mean(axis=1)
-
         with open(f"{data_dir}/schema.yaml") as file:
             self.schema: dict[str, list] = yaml.safe_load(file)
 
@@ -71,16 +66,12 @@ class TableManager:
             raise ValueError(f"No benchmark CSV files were read from {data_dir=}.")
 
         df = pd.merge(res_df, df_score, on=["model"]).round(2)
-
-        # Energy efficiency is defined as the amount of average NLP performance
-        # the model gets per Joule of energy.
-        df["energy_eff"] = (df["nlp_average"] / df["energy"]).round(4)
 
         # Order columns.
         columns = df.columns.to_list()
         cols_to_order = ["model"]
         cols_to_order.extend(self.schema.keys())
-        cols_to_order.extend(["energy_eff", "energy", "nlp_average"])
+        cols_to_order.append("energy")
         columns = cols_to_order + [col for col in columns if col not in cols_to_order]
         df = df[columns]
 
@@ -118,10 +109,19 @@ class TableManager:
 
         # Evaluate the formula and catch any error.
         try:
-            col = self.full_df.eval(formula)
-            if isinstance(col, pd.Series):
+            # Give the users some helper functions that can be used in the formula
+            # like "@sum(response_length)".
+            col = self.full_df.eval(
+                formula,
+                local_dict={"sum": sum, "len": len, "max": max, "min": min},
+            )
+            # Only round floating point columns.
+            if isinstance(col, pd.Series) and col.dtype.kind == "f":
                 col = col.round(2)
-            self.full_df[column_name] = col
+            if column_name in self.full_df.columns:
+                self.full_df[column_name] = col
+            else:
+                self.full_df.insert(len(self.schema) + 1, column_name, col)
         except Exception as exc:
             return self.cur_df, self._format_msg(f"Invalid formula: {exc}")
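To make the new formula path concrete, here is a minimal pandas sketch of what this evaluation does with the example formulas shipped further below; the table contents are made up, and only the column names (`energy`, `latency`, `response_length`) follow the leaderboard schema:

import pandas as pd

# Toy stand-in for the leaderboard table.
df = pd.DataFrame({
    "model": ["model-a", "model-b", "model-c"],
    "energy": [512.0, 430.0, 610.0],      # Joules per response
    "latency": [4.1, 3.2, 5.0],           # seconds per response
    "response_length": [220, 150, 310],   # tokens per response
})

# The helpers the app injects; formulas reference them with an '@' prefix.
helpers = {"sum": sum, "len": len, "max": max, "min": min}

# "power" example: plain column arithmetic.
df["power"] = df.eval("energy / latency", local_dict=helpers).round(2)

# "verbose" example: True for models whose responses are longer than average.
df["verbose"] = df.eval(
    "response_length > @sum(response_length) / @len(response_length)",
    local_dict=helpers,
)
print(df)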
 
@@ -132,8 +132,8 @@ class TableManager:
     def get_dropdown(self):
         columns = self.full_df.columns.tolist()[1:]
         return [
-            gr.Dropdown(value="nlp_average", choices=columns, label="X"),
-            gr.Dropdown(value="energy_eff", choices=columns, label="Y"),
+            gr.Dropdown(choices=columns, label="X"),
+            gr.Dropdown(choices=columns, label="Y"),
             gr.Dropdown(choices=["None", *columns], label="Z (optional)"),
         ]
 
@@ -303,8 +303,8 @@ with block:
     with gr.Row():
         with gr.Column(scale=3):
             with gr.Row():
-                colname_input = gr.Textbox("power", lines=1, label="Custom column name")
-                formula_input = gr.Textbox("energy/latency", lines=1, label="Formula")
+                colname_input = gr.Textbox(lines=1, label="Custom column name")
+                formula_input = gr.Textbox(lines=1, label="Formula (@sum, @len, @max, and @min are supported)")
         with gr.Column(scale=1):
             with gr.Row():
                 add_col_btn = gr.Button("Add to table (⏎)", elem_classes=["btn-submit"])
@@ -312,6 +312,14 @@ with block:
                 clear_input_btn = gr.Button("Clear")
         with gr.Row():
             add_col_message = gr.HTML("")
+        gr.Examples(
+            examples=[
+                ["power", "energy / latency"],
+                ["token_per_joule", "response_length / energy"],
+                ["verbose", "response_length > @sum(response_length) / @len(response_length)"],
+            ],
+            inputs=[colname_input, formula_input],
+        )
     colname_input.submit(
         TableManager.add_column,
         inputs=[tbm, colname_input, formula_input],
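For readers unfamiliar with Gradio: `gr.Examples` does not run anything by itself; clicking a row just copies its values into the listed inputs. A standalone sketch of the same pattern (not part of this commit):

import gradio as gr

with gr.Blocks() as demo:
    name = gr.Textbox(lines=1, label="Custom column name")
    formula = gr.Textbox(lines=1, label="Formula")
    # Clicking the example row fills the two textboxes above.
    gr.Examples(
        examples=[["power", "energy / latency"]],
        inputs=[name, formula],
    )

if __name__ == "__main__":
    demo.launch()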
@@ -349,14 +357,7 @@ with block:
                 plot_width_input = gr.Textbox("600", lines=1, label="Width (px)")
                 plot_height_input = gr.Textbox("600", lines=1, label="Height (px)")
         with gr.Row():
-            # By default show a plot of average model quality vs energy consumption.
-            plot = gr.Plot(global_tbm.plot_scatter(
-                width=plot_width_input.value,
-                height=plot_height_input.value,
-                x=axis_dropdowns[0].value,
-                y=axis_dropdowns[1].value,
-                z=axis_dropdowns[2].value,
-            )[0])
+            plot = gr.Plot()
         with gr.Row():
             plot_message = gr.HTML("")
     add_col_btn.click(TableManager.update_dropdown, inputs=tbm, outputs=axis_dropdowns)  # type: ignore
 