BenchmarkBot committed
Commit 0f1bf97 • 1 Parent(s): 5f0b430

added bettertransformer and LLM.int8
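For context on the two new leaderboard columns: "BetterTransformer 🤖" and "LLM.int8 🗜️" report whether a model was benchmarked with the BetterTransformer fastpath and/or 8-bit (LLM.int8) quantization via bitsandbytes. Below is a minimal sketch of how these backend options are usually toggled when loading a model with transformers; the model id and flags are illustrative only, not the benchmark harness itself.

# Minimal sketch (assumes transformers, optimum, accelerate and bitsandbytes are installed);
# it only illustrates what the two new columns report, not how the benchmark runs.
from transformers import AutoModelForCausalLM

model_id = "gpt2"  # illustrative model id, not one of the benchmarked checkpoints

# "backend.load_in_8bit" -> LLM.int8: 8-bit quantization via bitsandbytes (requires a CUDA GPU)
model_int8 = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    load_in_8bit=True,
)

# "backend.bettertransformer" -> BetterTransformer: fastpath attention kernels via optimum
model_bt = AutoModelForCausalLM.from_pretrained(model_id)
model_bt = model_bt.to_bettertransformer()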

Files changed (1):
  1. app.py (+77, -46)
app.py CHANGED
@@ -4,8 +4,20 @@ import pandas as pd
 import plotly.express as px
 from apscheduler.schedulers.background import BackgroundScheduler
 
-from src.assets.text_content import TITLE, INTRODUCTION_TEXT, SINGLE_A100_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT
-from src.utils import restart_space, load_dataset_repo, make_clickable_model, make_clickable_score, num_to_str
+from src.assets.text_content import (
+    TITLE,
+    INTRODUCTION_TEXT,
+    SINGLE_A100_TEXT,
+    CITATION_BUTTON_LABEL,
+    CITATION_BUTTON_TEXT,
+)
+from src.utils import (
+    restart_space,
+    load_dataset_repo,
+    make_clickable_model,
+    make_clickable_score,
+    num_to_str,
+)
 from src.assets.css_html_js import custom_css
 
 
@@ -17,13 +29,28 @@ COLUMNS_MAPPING = {
     "model": "Model 🤗",
     "backend.name": "Backend 🏭",
     "backend.torch_dtype": "Load Dtype 📥",
-    "num_parameters": "#Parameters 📏",
+    "num_parameters": "#️⃣ Parameters 📏",
+    #
     "forward.peak_memory(MB)": "Peak Memory (MB) ⬇️",
     "generate.throughput(tokens/s)": "Throughput (tokens/s) ⬆️",
     "average": "Average Open LLM Score ⬆️",
+    #
+    "backend.bettertransformer": "BetterTransformer 🤖",
+    "backend.load_in_8bit": "LLM.int8 🗜️",
 }
-COLUMNS_DATATYPES = ["markdown", "str", "str",
-                     "number", "number", "number", "markdown"]
+COLUMNS_DATATYPES = [
+    "markdown",
+    "str",
+    "str",
+    "str",
+    #
+    "number",
+    "number",
+    "markdown",
+    #
+    "str",
+    "str",
+]
 SORTING_COLUMN = ["Throughput (tokens/s) ⬆️"]
 
 
@@ -35,17 +62,14 @@ def get_benchmark_df(benchmark="1xA100-80GB"):
         llm_perf_dataset_repo.git_pull()
 
     # load
-    bench_df = pd.read_csv(
-        f"./llm-perf-dataset/reports/{benchmark}.csv")
-    scores_df = pd.read_csv(
-        f"./llm-perf-dataset/reports/additional_data.csv")
+    bench_df = pd.read_csv(f"./llm-perf-dataset/reports/{benchmark}.csv")
+    scores_df = pd.read_csv(f"./llm-perf-dataset/reports/additional_data.csv")
     bench_df = bench_df.merge(scores_df, on="model", how="left")
 
     return bench_df
 
 
 def get_benchmark_table(bench_df):
-
     # filter
     bench_df = bench_df[list(COLUMNS_MAPPING.keys())]
     # rename
@@ -54,73 +78,76 @@ def get_benchmark_table(bench_df):
     bench_df.sort_values(by=SORTING_COLUMN, ascending=False, inplace=True)
     # transform
     bench_df["Model 🤗"] = bench_df["Model 🤗"].apply(make_clickable_model)
+    bench_df["#️⃣ Parameters 📏"] = bench_df["#️⃣ Parameters 📏"].apply(num_to_str)
     bench_df["Average Open LLM Score ⬆️"] = bench_df["Average Open LLM Score ⬆️"].apply(
-        make_clickable_score)
-    bench_df["#Parameters 📏"] = bench_df["#Parameters 📏"].apply(num_to_str)
-
+        make_clickable_score
+    )
     return bench_df
 
 
 def get_benchmark_plot(bench_df):
-
     # untill falcon gets fixed / natively supported
     bench_df = bench_df[bench_df["generate.latency(s)"] < 100]
 
     fig = px.scatter(
-        bench_df, x="generate.latency(s)", y="average",
-        color='model_type', symbol='backend.name', size='forward.peak_memory(MB)',
-        custom_data=['model', 'backend.name', 'backend.torch_dtype',
-                     'forward.peak_memory(MB)', 'generate.throughput(tokens/s)'],
-        symbol_sequence=['triangle-up', 'circle'],
+        bench_df,
+        x="generate.latency(s)",
+        y="average",
+        color="model_type",
+        symbol="backend.name",
+        size="forward.peak_memory(MB)",
+        custom_data=[
+            "model",
+            "backend.name",
+            "backend.torch_dtype",
+            "forward.peak_memory(MB)",
+            "generate.throughput(tokens/s)",
+        ],
+        symbol_sequence=["triangle-up", "circle"],
         # as many distinct colors as there are model_type,backend.name couples
         color_discrete_sequence=px.colors.qualitative.Light24,
     )
 
     fig.update_layout(
         title={
-            'text': "Model Score vs. Latency vs. Memory",
-            'y': 0.95, 'x': 0.5,
-            'xanchor': 'center',
-            'yanchor': 'top'
+            "text": "Model Score vs. Latency vs. Memory",
+            "y": 0.95,
+            "x": 0.5,
+            "xanchor": "center",
+            "yanchor": "top",
         },
         xaxis_title="Per 1000 Tokens Latency (s)",
         yaxis_title="Average Open LLM Score",
         legend_title="Model Type and Backend",
         width=1200,
        height=600,
-        # legend=dict(
-        #     orientation="h",
-        #     yanchor="bottom",
-        #     y=-0.35,
-        #     xanchor="center",
-        #     x=0.5
-        # )
     )
 
     fig.update_traces(
-        hovertemplate="<br>".join([
-            "Model: %{customdata[0]}",
-            "Backend: %{customdata[1]}",
-            "Datatype: %{customdata[2]}",
-            "Peak Memory (MB): %{customdata[3]}",
-            "Throughput (tokens/s): %{customdata[4]}",
-            "Average Open LLM Score: %{y}",
-            "Per 1000 Tokens Latency (s): %{x}",
-        ])
+        hovertemplate="<br>".join(
+            [
+                "Model: %{customdata[0]}",
+                "Backend: %{customdata[1]}",
+                "Datatype: %{customdata[2]}",
+                "Peak Memory (MB): %{customdata[3]}",
+                "Throughput (tokens/s): %{customdata[4]}",
+                "Average Open LLM Score: %{y}",
+                "Per 1000 Tokens Latency (s): %{x}",
+            ]
+        )
     )
 
     return fig
 
 
 def filter_query(text, backends, datatypes, threshold, benchmark="1xA100-80GB"):
-
     raw_df = get_benchmark_df(benchmark=benchmark)
 
     filtered_df = raw_df[
-        raw_df["model"].str.lower().str.contains(text.lower()) &
-        raw_df["backend.name"].isin(backends) &
-        raw_df["backend.torch_dtype"].isin(datatypes) &
-        (raw_df["average"] >= threshold)
+        raw_df["model"].str.lower().str.contains(text.lower())
+        & raw_df["backend.name"].isin(backends)
+        & raw_df["backend.torch_dtype"].isin(datatypes)
+        & (raw_df["average"] >= threshold)
     ]
 
     filtered_table = get_benchmark_table(filtered_df)
@@ -221,8 +248,12 @@ with demo:
 
     # Restart space every hour
     scheduler = BackgroundScheduler()
-    scheduler.add_job(restart_space, "interval", seconds=3600,
-                      args=[LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN])
+    scheduler.add_job(
+        restart_space,
+        "interval",
+        seconds=3600,
+        args=[LLM_PERF_LEADERBOARD_REPO, OPTIMUM_TOKEN],
+    )
    scheduler.start()
 
     # Launch demo