BenchmarkBot committed
Commit 223c247 • 1 Parent(s): ce83759

added llm perf score

Files changed (1)
  1. app.py +41 -31
app.py CHANGED
@@ -1,4 +1,5 @@
 import os
+import math
 import gradio as gr
 import pandas as pd
 import plotly.express as px
@@ -16,8 +17,8 @@ from src.utils import (
     restart_space,
     load_dataset_repo,
     make_clickable_model,
-    make_clickable_score,
-    num_to_str,
+    # make_clickable_score,
+    # num_to_str,
 )
 from src.assets.css_html_js import custom_css, custom_js
 
@@ -32,12 +33,12 @@ COLUMNS_MAPPING = {
     "backend.torch_dtype": "Load Dtype 📥",
     "optimizations": "Optimizations 🛠️",
     #
+    "perf": "Open LLM-Perf Score ⬆️",
+    #
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
+    "score": "Open LLM Score ⬆️",
     "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
-    "score": "Average Open LLM Score ⬆️",
-    #
-    "composite_score": "Composite Score ⬆️",
-    "num_params": "#️⃣ Parameters 📏",
+    "num_params": "#️⃣ Parameters (M) 📏",
 }
 COLUMNS_DATATYPES = [
     "markdown",
@@ -47,12 +48,12 @@ COLUMNS_DATATYPES = [
     #
     "number",
     "number",
-    "markdown",
     #
     "number",
-    "str",
+    "number",
+    "number",
 ]
-SORTING_COLUMN = ["Composite Score ⬆️"]
+SORTING_COLUMN = ["Open LLM-Perf Score ⬆️"]
 
 
 llm_perf_dataset_repo = load_dataset_repo(LLM_PERF_DATASET_REPO, OPTIMUM_TOKEN)
@@ -62,36 +63,46 @@ def get_benchmark_df(benchmark="1xA100-80GB"):
     if llm_perf_dataset_repo:
         llm_perf_dataset_repo.git_pull()
 
-    # load
+    # load and merge
     bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
     scores_df = pd.read_csv(f"./llm-perf-dataset/reports/open-llm-leaderboard.csv")
     bench_df = bench_df.merge(scores_df, on="model", how="left")
+
+    # filter out models with no score
     bench_df = bench_df[bench_df["score"].notna()]
 
+    # create composite score
+    score_distance = 100 - bench_df["score"]
+    latency_distance = bench_df["generate.latency(s)"]
+    bench_df["perf"] = 1 / math.sqrt(score_distance**2 + latency_distance**2)
+    # normalize between 0 and 100
+    bench_df["perf"] = (
+        (bench_df["perf"] - bench_df["perf"].min())
+        / (bench_df["perf"].max() - bench_df["perf"].min())
+        * 100
+    )
+    # round to 2 decimals
+    bench_df["perf"] = bench_df["perf"].round(2)
+
+    # add optimizations
     bench_df["optimizations"] = bench_df[
         ["backend.bettertransformer", "backend.load_in_8bit", "backend.load_in_4bit"]
     ].apply(
         lambda x: ", ".join(
-            sum(
+            filter(
+                lambda x: x != "",
                 [
-                    ["BetterTransformer"] if x[0] == True else [],
-                    ["LLM.int8"] if x[1] == True else [],
-                    ["LLM.fp4"] if x[2] == True else [],
+                    "BetterTransformer" if x[0] == True else "",
+                    "LLM.int8" if x[1] == True else "",
+                    "LLM.fp4" if x[2] == True else "",
                 ],
-                [],
-            )
+            ),
         )
-        if any(x)
+        if any([x[0] == True, x[1] == True, x[2] == True])
         else "None",
         axis=1,
     )
 
-    # create composite score
-    normalized_score = 100 - bench_df["score"]
-    normalized_latency = bench_df["generate.latency(s)"]
-    # normalized_memory = (bench_df["forward.peak_memory(MB)"].max()-bench_df["forward.peak_memory(MB)"])/(bench_df["forward.peak_memory(MB)"].max()-bench_df["forward.peak_memory(MB)"].min())
-    bench_df["composite_score"] = normalized_score + normalized_latency
-
     return bench_df
 
 
@@ -102,13 +113,12 @@ def get_benchmark_table(bench_df):
     bench_df.rename(columns=COLUMNS_MAPPING, inplace=True)
     # sort
    bench_df.sort_values(by=SORTING_COLUMN, ascending=True, inplace=True)
-
     # transform
     bench_df["Model 🤗"] = bench_df["Model 🤗"].apply(make_clickable_model)
-    bench_df["#️⃣ Parameters 📏"] = bench_df["#️⃣ Parameters 📏"].apply(num_to_str)
-    bench_df["Average Open LLM Score ⬆️"] = bench_df["Average Open LLM Score ⬆️"].apply(
-        make_clickable_score
+    bench_df["#️⃣ Parameters (M) 📏"] = bench_df["#️⃣ Parameters 📏"].apply(
+        lambda x: int(x / (1024 * 1024))
     )
+
     return bench_df
 
 
@@ -144,7 +154,7 @@ def get_benchmark_plot(bench_df):
             "yanchor": "top",
         },
         xaxis_title="Per 1000 Tokens Latency (s)",
-        yaxis_title="Average Open LLM Score",
+        yaxis_title="Open LLM Score",
         legend_title="Model Type and Backend",
         width=1200,
         height=600,
@@ -159,8 +169,8 @@
                 "Optimizations: %{customdata[3]}",
                 "Peak Memory (MB): %{customdata[4]}",
                 "Throughput (tokens/s): %{customdata[5]}",
-                "Average Open LLM Score: %{y}",
                 "Per 1000 Tokens Latency (s): %{x}",
+                "Open LLM Score: %{y}",
             ]
         )
     )
@@ -232,8 +242,8 @@ with demo:
         with gr.Column(scale=1):
            with gr.Box():
                score_slider = gr.Slider(
-                    label="Average Open LLM Score 📈",
-                    info="🎚️ Slide to minimum Average Open LLM score",
+                    label="Open LLM Score 📈",
+                    info="🎚️ Slide to minimum Open LLM score",
                     value=0,
                     elem_id="threshold-slider",
                 )
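
Below is a minimal, self-contained sketch of the composite "perf" score this commit introduces, run on a toy DataFrame. The column names ("score", "generate.latency(s)") and the normalization steps follow the diff above; the model names and values are invented for illustration, and an element-wise square root (** 0.5) is used here because the distances are pandas Series rather than scalars.

import pandas as pd

# toy stand-in for the merged benchmark + open-llm-leaderboard report
bench_df = pd.DataFrame(
    {
        "model": ["model-a", "model-b", "model-c"],
        "score": [65.2, 48.7, 71.9],               # Open LLM score (0-100, higher is better)
        "generate.latency(s)": [12.3, 4.1, 30.5],  # per-1000-tokens generation latency
    }
)

# distances to the ideal point: a perfect score of 100 at zero latency
score_distance = 100 - bench_df["score"]
latency_distance = bench_df["generate.latency(s)"]

# inverse Euclidean distance to the ideal point
bench_df["perf"] = 1 / (score_distance**2 + latency_distance**2) ** 0.5

# min-max normalize to 0-100 and round to 2 decimals, as in the commit
bench_df["perf"] = (
    (bench_df["perf"] - bench_df["perf"].min())
    / (bench_df["perf"].max() - bench_df["perf"].min())
    * 100
).round(2)

print(bench_df[["model", "perf"]])

Because of the min-max normalization, the model closest to the ideal point always lands at 100 and the farthest at 0, so the perf score is relative to the set of models in a given report rather than an absolute measure.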