BenchmarkBot committed
Commit 67cbded • 1 Parent(s): bf0a261

made scores clickable

Files changed (3)
  1. app.py +9 -30
  2. src/assets/css_html_js.py +0 -36
  3. src/assets/text_content.py +7 -9
app.py CHANGED
@@ -1,5 +1,4 @@
 import os
-import json
 import gradio as gr
 import pandas as pd
 from apscheduler.schedulers.background import BackgroundScheduler
@@ -21,7 +20,7 @@ COLUMNS_MAPPING = {
     "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
 }
-COLUMNS_DATATYPES = ["markdown", "str", "str", "number", "number", "number"]
+COLUMNS_DATATYPES = ["markdown", "str", "str", "markdown", "number", "number"]
 SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
@@ -39,8 +38,8 @@ def get_benchmark_df(benchmark):
     scores_df = pd.read_csv(
         f"./llm-perf-dataset/reports/average_scores.csv")
     bench_df = bench_df.merge(scores_df, on="model", how="left")
-    # bench_df["average"] = bench_df["average"].apply(
-    #     make_clickable_score)
+    bench_df["average"] = bench_df["average"].apply(
+        make_clickable_score)
 
     # preprocess
     bench_df["model"] = bench_df["model"].apply(make_clickable_model)
@@ -54,33 +53,19 @@ def get_benchmark_df(benchmark):
     return bench_df
 
 
-# def change_tab(query_param):
-#     query_param = query_param.replace("'", '"')
-#     query_param = json.loads(query_param)
-
-#     if (
-#         isinstance(query_param, dict)
-#         and "tab" in query_param
-#         and query_param["tab"] == "evaluation"
-#     ):
-#         return gr.Tabs.update(selected=1)
-#     else:
-#         return gr.Tabs.update(selected=0)
-
-
 def submit_query(text, backends, datatypes, threshold, raw_df):
 
     # extract the average score (float) from the clickable score (clickable markdown)
-    # raw_df["Average H4 Score ⬆️"] = raw_df["Average H4 Score ⬆️"].apply(
-    #     extract_score_from_clickable)
+    raw_df["Average H4 Score ⬆️"] = raw_df["Average H4 Score ⬆️"].apply(
+        extract_score_from_clickable)
     filtered_df = raw_df[
         raw_df["Model 🤗"].str.lower().str.contains(text.lower()) &
         raw_df["Backend 🏭"].isin(backends) &
         raw_df["Datatype 📥"].isin(datatypes) &
         (raw_df["Average H4 Score ⬆️"] >= threshold)
     ]
-    # filtered_df["Average H4 Score ⬆️"] = filtered_df["Average H4 Score ⬆️"].apply(
-    #     make_clickable_score)
+    filtered_df["Average H4 Score ⬆️"] = filtered_df["Average H4 Score ⬆️"].apply(
+        make_clickable_score)
 
     return filtered_df
 
@@ -91,6 +76,7 @@ with demo:
     gr.HTML(TITLE)
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
+    # controls
     with gr.Row():
         search_bar = gr.Textbox(
             label="Model 🤗",
@@ -127,6 +113,7 @@ with demo:
             elem_id="submit-button",
         )
 
+    # leaderboard tabs
    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🖥️ A100-80GB Benchmark 🏋️", elem_id="A100-benchmark", id=0):
            gr.HTML(SINGLE_A100_TEXT)
@@ -166,14 +153,6 @@ with demo:
             elem_id="citation-button",
         ).style(show_copy_button=True)
 
-    # dummy = gr.Textbox(visible=False)
-    # demo.load(
-    #     change_tab,
-    #     dummy,
-    #     tabs,
-    #     _js=get_window_url_params,
-    # )
-
 # Restart space every hour
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=3600,
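
Note: the helpers referenced in this diff (make_clickable_model, make_clickable_score, extract_score_from_clickable) are imported from elsewhere in the Space and are not shown in this commit. Below is a minimal sketch of what the two score helpers might look like, assuming the score is wrapped in a markdown link; the link target and exact formatting are illustrative assumptions, not taken from the repository.

def make_clickable_score(score: float) -> str:
    # Wrap the raw float in a markdown link; with the column datatype set to
    # "markdown", gr.Dataframe renders the cell as a hyperlink instead of a number.
    url = "https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard"  # assumed target
    return f"[{score:.2f}]({url})"

def extract_score_from_clickable(clickable_score: str) -> float:
    # Recover the float from a string like "[12.34](https://...)" so it can be
    # compared against the threshold slider in submit_query.
    return float(clickable_score.split("](")[0].removeprefix("["))

This pairs with the COLUMNS_DATATYPES change above: switching the score column from "number" to "markdown" is what lets the linked score render in the table, while extract_score_from_clickable converts it back to a plain number before the threshold filter is applied and make_clickable_score re-wraps it afterwards.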
src/assets/css_html_js.py CHANGED
@@ -1,12 +1,4 @@
 custom_css = """
-#changelog-text {
-    font-size: 16px !important;
-}
-
-#changelog-text h2 {
-    font-size: 18px !important;
-}
-
 .markdown-text {
     font-size: 16px !important;
 }
@@ -28,26 +20,11 @@ custom_css = """
     transform: scale(1.3);
 }
 
-#leaderboard-table {
-    margin-top: 15px
-}
-
-#leaderboard-table-lite {
-    margin-top: 15px
-}
-
 #search-bar-table-box > div:first-child {
     background: none;
     border: none;
 }
 
-
-/* Hides the final AutoEvalColumn */
-#llm-benchmark-tab-table table td:last-child,
-#llm-benchmark-tab-table table th:last-child {
-    display: none;
-}
-
 /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
 table td:first-child,
 table th:first-child {
@@ -59,19 +36,6 @@ table th:first-child {
 .tab-buttons button {
     font-size: 20px;
 }
-
-#scale-logo {
-    border-style: none !important;
-    box-shadow: none;
-    display: block;
-    margin-left: auto;
-    margin-right: auto;
-    max-width: 600px;
-}
-
-#scale-logo .download {
-    display: none;
-}
 """
 
 get_window_url_params = """
src/assets/text_content.py CHANGED
@@ -1,22 +1,20 @@
 TITLE = """<h1 align="center" id="space-title">🤗 Open LLM-Perf Leaderboard 🏋️</h1>"""
 
 INTRODUCTION_TEXT = f"""
-The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency & throughput) of Large Language Models (LLMs) on different hardwares and backends using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
-Anyone from the community can request a model or a hardware+backend configuration for automated benchmarking:
-- Model requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ once they're publicly available.
-- Hardware+Backend requests should be made in the 🤗 Open LLM-Perf Leaderboard 🏋️ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions).
+The 🤗 Open LLM-Perf Leaderboard 🏋️ aims to benchmark the performance (latency & throughput) of Large Language Models (LLMs) with different hardwares, backends and optimizations using [Optimum-Benchmark](https://github.com/huggingface/optimum-benchmark) and [Optimum](https://github.com/huggingface/optimum) flavors.
 
-[Config files](https://github.com/huggingface/optimum-benchmark/blob/main/examples/bert.yaml) (which can be used with Optimum-Benchmark) will be available soon for reproduction, questioning and correction of our results.
+Anyone from the community can request a model or a hardware+backend+optimization configuration for automated benchmarking:
+- Model requests should be made in the [🤗 Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard) and will be added to the 🤗 Open LLM-Perf Leaderboard 🏋️ automatically once they're publicly available. That's mostly because we don't want to benchmark models that don't have an evaluation score yet.
+- Hardware+Backend+Optimization requests should be made in the 🤗 Open LLM-Perf Leaderboard 🏋️ [community discussions](https://huggingface.co/spaces/optimum/llm-perf-leaderboard/discussions) for open discussion about their relevance and feasibility.
 """
 
-SINGLE_A100_TEXT = """<h3>Single-GPU (1xA100):</h3>
+SINGLE_A100_TEXT = """<h3>Single-GPU Benchmarks (1xA100):</h3>
 <ul>
 <li>Singleton Batch (1)</li>
 <li>Thousand Tokens (1000)</li>
 </ul>
 """
 
-
 CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results."
 CITATION_BUTTON_TEXT = r"""@misc{open-llm-perf-leaderboard,
 author = {Ilyas Moutawwakil},
@@ -25,8 +23,8 @@ CITATION_BUTTON_TEXT = r"""@misc{open-llm-perf-leaderboard,
 publisher = {Hugging Face},
 howpublished = "\url{https://huggingface.co/spaces/optimum/llm-perf-leaderboard}",
 @software{optimum-benchmark,
-author = {Ilyas Moutawwakil},
+author = {Ilyas Moutawwakil},
 publisher = {Hugging Face},
-title = {A framework for benchmarking the performance of Transformers models on different hardwares and backends},
+title = {Optimum-Benchmark: A framework for benchmarking the performance of Transformers models with different hardwares, backends and optimizations.},
 }
 """