Jae-Won Chung committed on
Commit
81672d7
1 Parent(s): 36058af

More info in app and about page (#14)

Browse files
Files changed (2) hide show
  1. LEADERBOARD.md +15 -1
  2. app.py +112 -84
LEADERBOARD.md CHANGED
@@ -65,6 +65,20 @@ Find our benchmark script for one model [here](https://github.com/ml-energy/lead
65
  We randomly sampled around 3000 prompts from the [cleaned ShareGPT dataset](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered).
66
  See [here](https://github.com/ml-energy/leaderboard/tree/master/sharegpt) for more detail on how we created the benchmark dataset.
67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  ## Contributing
69
 
70
  Any kind of contribution is more than welcome!
@@ -84,7 +98,7 @@ Hence, absolute latency, throughput, and energy numbers should not be used to es
84
 
85
  Batch size 1, in some sense, is the lowest possible hardware utilization.
86
  We'll soon benchmark batch sizes larger than 1 without continuous batching for comparison.
87
- This would show what happens in the case of very high hardware utilization (lest with PyTorch), assuming an ideal case where all sequences in each batch generates the same number of output tokens.
88
  By doing this, we can provide numbers for reasonable comparison without being tied to any existing generative model serving system.
89
 
90
  ## Upcoming
 
65
  We randomly sampled around 3000 prompts from the [cleaned ShareGPT dataset](https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered).
66
  See [here](https://github.com/ml-energy/leaderboard/tree/master/sharegpt) for more detail on how we created the benchmark dataset.
67
 
68
+ ## FAQ
69
+
70
+ ### So who's the winner?
71
+
72
+ It depends on which metric you value most.
73
+ Some may be tightly constrained by electricity consumption, in which case energy would have higher weight.
74
+ Some may just want better model quality, in which case the NLP dataset results will be important.
75
+ Others might want something balanced.
76
+ This is why we support adding custom columns to the table, and let you choose your own winner!
77
+
78
+ ### Where can I find more ML energy-related resources?
79
+
80
+ Meet us at the [ML.ENERGY initiative](https://ml.energy) homepage!
81
+
82
  ## Contributing
83
 
84
  Any kind of contribution is more than welcome!
 
98
 
99
  Batch size 1, in some sense, is the lowest possible hardware utilization.
100
  We'll soon benchmark batch sizes larger than 1 without continuous batching for comparison.
101
+ This would show what happens in the case of very high hardware utilization (although it's with PyTorch), assuming an ideal case where all sequences in each batch generate the same number of output tokens.
102
  By doing this, we can provide numbers for reasonable comparison without being tied to any existing generative model serving system.
103
 
104
  ## Upcoming
app.py CHANGED
@@ -160,8 +160,8 @@ class TableManager:
160
  def get_dropdown(self):
161
  columns = self.full_df.columns.tolist()[1:]
162
  return [
163
- gr.Dropdown(choices=columns, label="X"),
164
- gr.Dropdown(choices=columns, label="Y"),
165
  gr.Dropdown(choices=["None", *columns], label="Z (optional)"),
166
  ]
167
 
@@ -306,16 +306,36 @@ table th:first-child {
306
  overflow: auto;
307
  white-space: nowrap;
308
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  """
310
 
311
  block = gr.Blocks(css=css)
312
  with block:
313
  tbm = gr.State(global_tbm) # type: ignore
314
- gr.HTML("<h1><a href='https://ml.energy' class='text-logo'>ML.ENERGY</a> Leaderboard</h1>")
 
315
 
316
  with gr.Tabs():
317
  # Tab 1: Leaderboard.
318
- with gr.TabItem("Leaderboard"):
 
 
 
319
  # Block 1: Checkboxes to select benchmarking parameters.
320
  with gr.Row():
321
  with gr.Box():
@@ -335,96 +355,104 @@ with block:
335
  checkbox.change(TableManager.set_filter_get_df, inputs=[tbm, *checkboxes], outputs=dataframe)
336
 
337
  # Block 3: Allow users to add new columns.
338
- gr.Markdown("---\n### Add custom columns to the table")
339
- with gr.Row():
340
- with gr.Column(scale=3):
341
- with gr.Row():
342
- colname_input = gr.Textbox(lines=1, label="Custom column name")
343
- formula_input = gr.Textbox(lines=1, label="Formula (@sum, @len, @max, and @min are supported)")
344
- with gr.Column(scale=1):
345
- with gr.Row():
346
- add_col_btn = gr.Button("Add to table (⏎)", elem_classes=["btn-submit"])
347
- with gr.Row():
348
- clear_input_btn = gr.Button("Clear")
349
- with gr.Row():
350
- add_col_message = gr.HTML("")
351
- gr.Examples(
352
- examples=[
353
- ["power", "energy / latency"],
354
- ["token_per_joule", "response_length / energy"],
355
- ["verbose", "response_length > @sum(response_length) / @len(response_length)"],
356
- ],
357
- inputs=[colname_input, formula_input],
358
- )
359
- colname_input.submit(
360
- TableManager.add_column,
361
- inputs=[tbm, colname_input, formula_input],
362
- outputs=[dataframe, add_col_message],
363
- )
364
- formula_input.submit(
365
- TableManager.add_column,
366
- inputs=[tbm, colname_input, formula_input],
367
- outputs=[dataframe, add_col_message],
368
- )
369
- add_col_btn.click(
370
- TableManager.add_column,
371
- inputs=[tbm, colname_input, formula_input],
372
- outputs=[dataframe, add_col_message],
373
- )
374
- clear_input_btn.click(
375
- lambda: (None, None, None),
376
- inputs=None,
377
- outputs=[colname_input, formula_input, add_col_message],
378
- )
 
379
 
380
  # Block 4: Allow users to plot 2D and 3D scatter plots.
381
- gr.Markdown("---\n### Scatter plot (Hover over marker to show model name)")
382
- with gr.Row():
383
- with gr.Column(scale=3):
384
- with gr.Row():
385
- # Initialize the dropdown choices with the global TableManager with just the original columns.
386
- axis_dropdowns = global_tbm.get_dropdown()
387
- with gr.Column(scale=1):
388
- with gr.Row():
389
- plot_btn = gr.Button("Plot", elem_classes=["btn-submit"])
 
 
 
 
390
  with gr.Row():
391
- clear_plot_btn = gr.Button("Clear")
392
- with gr.Accordion("Plot size (600 x 600 by default)", open=False):
393
  with gr.Row():
394
- plot_width_input = gr.Textbox("600", lines=1, label="Width (px)")
395
- plot_height_input = gr.Textbox("600", lines=1, label="Height (px)")
396
- with gr.Row():
397
- plot = gr.Plot()
398
- with gr.Row():
399
- plot_message = gr.HTML("")
400
- add_col_btn.click(TableManager.update_dropdown, inputs=tbm, outputs=axis_dropdowns) # type: ignore
401
- plot_width_input.submit(
402
- TableManager.plot_scatter,
403
- inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
404
- outputs=[plot, plot_width_input, plot_height_input, plot_message],
405
- )
406
- plot_height_input.submit(
407
- TableManager.plot_scatter,
408
- inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
409
- outputs=[plot, plot_width_input, plot_height_input, plot_message],
410
- )
411
- plot_btn.click(
412
- TableManager.plot_scatter,
413
- inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
414
- outputs=[plot, plot_width_input, plot_height_input, plot_message],
415
- )
416
- clear_plot_btn.click(
417
- lambda: (None,) * 7,
418
- None,
419
- outputs=[*axis_dropdowns, plot, plot_width_input, plot_height_input, plot_message],
420
- )
 
 
 
421
 
422
  # Block 5: Leaderboard date.
423
  with gr.Row():
424
  gr.HTML(f"<h3 style='color: gray'>Last updated: {current_date}</h3>")
425
 
426
  # Tab 2: About page.
427
- with gr.TabItem("About"):
428
  # Read in LEADERBOARD.md
429
  gr.Markdown(open("LEADERBOARD.md").read())
430
 
 
160
  def get_dropdown(self):
161
  columns = self.full_df.columns.tolist()[1:]
162
  return [
163
+ gr.Dropdown(choices=columns, value="parameters", label="X"),
164
+ gr.Dropdown(choices=columns, value="energy", label="Y"),
165
  gr.Dropdown(choices=["None", *columns], label="Z (optional)"),
166
  ]
167
 
 
306
  overflow: auto;
307
  white-space: nowrap;
308
  }
309
+
310
+ /* Make tab buttons larger */
311
+ .tab-nav > button {
312
+ font-size: 18px !important;
313
+ }
314
+ """
315
+
316
+ intro_text = """
317
+ <h2>How much energy do modern Large Language Models (LLMs) consume for inference?</h2>
318
+
319
+ <p style="font-size: 16px">We used <a href="https://ml.energy/zeus">Zeus</a> to benchmark various open source LLMs in terms of how much time and energy they consume for inference.
320
+ Time and energy are of course not the only things we care about -- so we also benchmarked all of the models on a variety of NLP datasets,
321
+ including the ARC Challenge (reasoning), HellaSwag (common sense), and TruthfulQA (truthfulness).</p>
322
+
323
+ <p style="font-size: 16px">For more detailed information, please take a look at the <b>About</b> tab.
324
+ Every benchmark is limited in some sense -- before you interpret the results, please take a look at the <b>Limitations</b> section there, too.</p>
325
  """
326
 
327
  block = gr.Blocks(css=css)
328
  with block:
329
  tbm = gr.State(global_tbm) # type: ignore
330
+ with gr.Box():
331
+ gr.HTML("<h1><a href='https://ml.energy' class='text-logo'>ML.ENERGY</a> Leaderboard</h1>")
332
 
333
  with gr.Tabs():
334
  # Tab 1: Leaderboard.
335
+ with gr.Tab("Leaderboard"):
336
+ with gr.Box():
337
+ gr.HTML(intro_text)
338
+
339
  # Block 1: Checkboxes to select benchmarking parameters.
340
  with gr.Row():
341
  with gr.Box():
 
355
  checkbox.change(TableManager.set_filter_get_df, inputs=[tbm, *checkboxes], outputs=dataframe)
356
 
357
  # Block 3: Allow users to add new columns.
358
+ with gr.Box():
359
+ gr.Markdown("### Add custom columns to the table")
360
+ with gr.Row():
361
+ with gr.Column(scale=3):
362
+ with gr.Row():
363
+ colname_input = gr.Textbox(lines=1, label="Custom column name")
364
+ formula_input = gr.Textbox(lines=1, label="Formula (@sum, @len, @max, and @min are supported)")
365
+ with gr.Column(scale=1):
366
+ with gr.Row():
367
+ add_col_btn = gr.Button("Add to table (⏎)", elem_classes=["btn-submit"])
368
+ with gr.Row():
369
+ clear_input_btn = gr.Button("Clear")
370
+ with gr.Row():
371
+ add_col_message = gr.HTML("")
372
+ gr.Examples(
373
+ examples=[
374
+ ["power", "energy / latency"],
375
+ ["token_per_joule", "response_length / energy"],
376
+ ["verbose", "response_length > @sum(response_length) / @len(response_length)"],
377
+ ],
378
+ inputs=[colname_input, formula_input],
379
+ )
380
+ colname_input.submit(
381
+ TableManager.add_column,
382
+ inputs=[tbm, colname_input, formula_input],
383
+ outputs=[dataframe, add_col_message],
384
+ )
385
+ formula_input.submit(
386
+ TableManager.add_column,
387
+ inputs=[tbm, colname_input, formula_input],
388
+ outputs=[dataframe, add_col_message],
389
+ )
390
+ add_col_btn.click(
391
+ TableManager.add_column,
392
+ inputs=[tbm, colname_input, formula_input],
393
+ outputs=[dataframe, add_col_message],
394
+ )
395
+ clear_input_btn.click(
396
+ lambda: (None, None, None),
397
+ inputs=None,
398
+ outputs=[colname_input, formula_input, add_col_message],
399
+ )
400
 
401
  # Block 4: Allow users to plot 2D and 3D scatter plots.
402
+ with gr.Box():
403
+ gr.Markdown("### Scatter plot (Hover over marker to show model name)")
404
+ with gr.Row():
405
+ with gr.Column(scale=3):
406
+ with gr.Row():
407
+ # Initialize the dropdown choices with the global TableManager with just the original columns.
408
+ axis_dropdowns = global_tbm.get_dropdown()
409
+ with gr.Column(scale=1):
410
+ with gr.Row():
411
+ plot_btn = gr.Button("Plot", elem_classes=["btn-submit"])
412
+ with gr.Row():
413
+ clear_plot_btn = gr.Button("Clear")
414
+ with gr.Accordion("Plot size (600 x 600 by default)", open=False):
415
  with gr.Row():
416
+ plot_width_input = gr.Textbox("600", lines=1, label="Width (px)")
417
+ plot_height_input = gr.Textbox("600", lines=1, label="Height (px)")
418
  with gr.Row():
419
+ plot = gr.Plot(value=global_tbm.plot_scatter(
420
+ plot_width_input.value,
421
+ plot_height_input.value,
422
+ x=axis_dropdowns[0].value,
423
+ y=axis_dropdowns[1].value,
424
+ z=axis_dropdowns[2].value,
425
+ )[0]) # type: ignore
426
+ with gr.Row():
427
+ plot_message = gr.HTML("")
428
+ add_col_btn.click(TableManager.update_dropdown, inputs=tbm, outputs=axis_dropdowns) # type: ignore
429
+ plot_width_input.submit(
430
+ TableManager.plot_scatter,
431
+ inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
432
+ outputs=[plot, plot_width_input, plot_height_input, plot_message],
433
+ )
434
+ plot_height_input.submit(
435
+ TableManager.plot_scatter,
436
+ inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
437
+ outputs=[plot, plot_width_input, plot_height_input, plot_message],
438
+ )
439
+ plot_btn.click(
440
+ TableManager.plot_scatter,
441
+ inputs=[tbm, plot_width_input, plot_height_input, *axis_dropdowns],
442
+ outputs=[plot, plot_width_input, plot_height_input, plot_message],
443
+ )
444
+ clear_plot_btn.click(
445
+ lambda: (None,) * 7,
446
+ None,
447
+ outputs=[*axis_dropdowns, plot, plot_width_input, plot_height_input, plot_message],
448
+ )
449
 
450
  # Block 5: Leaderboard date.
451
  with gr.Row():
452
  gr.HTML(f"<h3 style='color: gray'>Last updated: {current_date}</h3>")
453
 
454
  # Tab 2: About page.
455
+ with gr.Tab("About"):
456
  # Read in LEADERBOARD.md
457
  gr.Markdown(open("LEADERBOARD.md").read())
458