IlyasMoutawwakil HF staff committed on
Commit
3d7033f
1 Parent(s): 16a8bbd
Files changed (3) hide show
  1. app.py +38 -61
  2. script.py +14 -0
  3. src/utils.py +10 -5
app.py CHANGED
@@ -21,33 +21,27 @@ HF_TOKEN = os.environ.get("HF_TOKEN", None)
21
  LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
22
  MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB 🖥️"}
23
  ALL_COLUMNS_MAPPING = {
24
- # model
25
  "Model": "Model 🤗",
26
  "Arch": "Arch 🏛️",
27
- "Size": "Size 📏",
28
  # deployment settings
29
  "backend.name": "Backend 🏭",
30
  "backend.torch_dtype": "Dtype 📥",
31
- "optimizations": "Optimizations 🛠️",
32
  "quantization": "Quantization 🗜️",
33
- # quality measurements
34
- "Score": "Avg Score (%) ⬆️",
35
- # throughput measurements
36
  "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
37
  "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
38
- # latency measurements
39
  "forward.latency(s)": "Prefill Latency (s) ⬇️",
40
  "generate.latency(s)": "E2E Latency (s) ⬇️",
41
- # memory measurements
42
  "generate.max_memory_allocated(MB)": "Allocated Memory (MB) ⬇️",
43
  "generate.max_memory_reserved(MB)": "Reserved Memory (MB) ⬇️",
44
  "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
45
- # energy measurements
46
  "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬇️",
47
  }
48
  SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
49
  SORTING_ASCENDING = [False, True]
50
-
51
  ALL_COLUMNS_DATATYPES = [
52
  # open llm
53
  "markdown",
@@ -70,17 +64,18 @@ ALL_COLUMNS_DATATYPES = [
70
  "number",
71
  "number",
72
  ]
 
 
 
 
 
 
 
 
 
73
 
74
-
75
- def get_benchmark_df(machine="hf-dgx-01"):
76
- # download data
77
- hf_hub_download(
78
- repo_id="optimum/llm-perf-dataset",
79
- filename="open-llm.csv",
80
- local_dir="dataset",
81
- repo_type="dataset",
82
- token=HF_TOKEN,
83
- )
84
  hf_hub_download(
85
  repo_id="optimum/llm-perf-dataset",
86
  filename=f"{machine}/full-report.csv",
@@ -88,11 +83,13 @@ def get_benchmark_df(machine="hf-dgx-01"):
88
  repo_type="dataset",
89
  token=HF_TOKEN,
90
  )
91
- open_llm = pd.read_csv("dataset/open-llm.csv")
92
- full_report = pd.read_csv(f"dataset/{machine}/full-report.csv")
93
 
 
94
  # merge on model
95
- merged_df = open_llm.merge(full_report, left_on="Model", right_on="model")
 
96
  # transpose energy consumption
97
  merged_df["generate.energy_consumption(tokens/kWh)"] = (
98
  1 / merged_df["generate.energy_consumption(kWh/token)"].fillna(1)
@@ -102,8 +99,8 @@ def get_benchmark_df(machine="hf-dgx-01"):
102
  merged_df["generate.energy_consumption(tokens/kWh)"] == 1,
103
  "generate.energy_consumption(tokens/kWh)",
104
  ] = pd.NA
105
- # add optimizations column
106
- merged_df["optimizations"] = merged_df[
107
  ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
108
  ].apply(
109
  lambda x: "BetterTransformer"
@@ -135,10 +132,10 @@ def get_benchmark_table(bench_df):
135
  copy_df["Model 🤗"] = copy_df["Model 🤗"].apply(process_model_name)
136
  copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
137
  # process quantization
138
- copy_df["Avg Score (%) ⬆️"] = copy_df.apply(
139
- lambda x: f"{x['Avg Score (%) ⬆️']}**"
140
  if x["Quantization 🗜️"] in ["BnB.4bit", "GPTQ.4bit"]
141
- else x["Avg Score (%) ⬆️"],
142
  axis=1,
143
  )
144
  return copy_df
@@ -151,7 +148,7 @@ def get_benchmark_chart(bench_df):
151
  # plot
152
  fig = px.scatter(
153
  copy_df,
154
- y="Avg Score (%) ⬆️",
155
  x="E2E Latency (s) ⬇️",
156
  size="Allocated Memory (MB) ⬇️",
157
  color="Arch 🏛️",
@@ -167,7 +164,7 @@ def get_benchmark_chart(bench_df):
167
  "yanchor": "top",
168
  },
169
  xaxis_title="Per 1000 Tokens Latency (s)",
170
- yaxis_title="Avg Open LLM Score (%)",
171
  legend_title="LLM Architecture",
172
  width=1200,
173
  height=600,
@@ -188,7 +185,7 @@ def filter_query(
188
  backends,
189
  datatypes,
190
  optimizations,
191
- quantization_scheme,
192
  score,
193
  memory,
194
  machine,
@@ -198,29 +195,9 @@ def filter_query(
198
  raw_df["Model 🤗"].str.contains(text, case=False)
199
  & raw_df["Backend 🏭"].isin(backends)
200
  & raw_df["Dtype 📥"].isin(datatypes)
201
- & (
202
- pd.concat(
203
- [
204
- raw_df["Optimizations 🛠️"].str.contains(optimization, case=False)
205
- for optimization in optimizations
206
- ],
207
- axis=1,
208
- ).any(axis="columns")
209
- if len(optimizations) > 0
210
- else True
211
- )
212
- & (
213
- pd.concat(
214
- [
215
- raw_df["Quantization 🗜️"].str.contains(quantization, case=False)
216
- for quantization in quantization_scheme
217
- ],
218
- axis=1,
219
- ).any(axis="columns")
220
- if len(quantization_scheme) > 0
221
- else True
222
- )
223
- & (raw_df["Avg Score (%) ⬆️"] >= score)
224
  & (raw_df["Allocated Memory (MB) ⬇️"] <= memory)
225
  ]
226
  filtered_table = get_benchmark_table(filtered_df)
@@ -289,7 +266,7 @@ with demo:
289
  with gr.Row():
290
  with gr.Column(scale=1):
291
  score_slider = gr.Slider(
292
- label="Open LLM Score 📈",
293
  info="🎚️ Slide to minimum Open LLM score",
294
  value=0,
295
  elem_id="threshold-slider",
@@ -321,12 +298,12 @@ with demo:
321
  elem_id="dtype-checkboxes",
322
  )
323
  with gr.Column(scale=1):
324
- optimizations_checkboxes = gr.CheckboxGroup(
325
  label="Optimizations 🛠️",
326
- choices=["None", "BetterTransformer"],
327
- value=["None", "BetterTransformer"],
328
- info="☑️ Select the optimizations",
329
- elem_id="optimizations-checkboxes",
330
  )
331
  with gr.Column(scale=1):
332
  quantization_checkboxes = gr.CheckboxGroup(
@@ -348,7 +325,7 @@ with demo:
348
  search_bar,
349
  backend_checkboxes,
350
  datatype_checkboxes,
351
- optimizations_checkboxes,
352
  quantization_checkboxes,
353
  score_slider,
354
  memory_slider,
 
21
  LLM_PERF_DATASET_REPO = "optimum/llm-perf-dataset"
22
  MACHINE_TO_HARDWARE = {"hf-dgx-01": "A100-80GB 🖥️"}
23
  ALL_COLUMNS_MAPPING = {
 
24
  "Model": "Model 🤗",
25
  "Arch": "Arch 🏛️",
26
+ "Size": "Params (B) 📏",
27
  # deployment settings
28
  "backend.name": "Backend 🏭",
29
  "backend.torch_dtype": "Dtype 📥",
30
+ "optimization": "Optimization 🛠️",
31
  "quantization": "Quantization 🗜️",
32
+ # measurements
33
+ "Score": "Open LLM Score (%) ⬆️",
 
34
  "decode.throughput(tokens/s)": "Decode Throughput (tokens/s) ⬆️",
35
  "generate.throughput(tokens/s)": "E2E Throughput (tokens/s) ⬆️",
 
36
  "forward.latency(s)": "Prefill Latency (s) ⬇️",
37
  "generate.latency(s)": "E2E Latency (s) ⬇️",
 
38
  "generate.max_memory_allocated(MB)": "Allocated Memory (MB) ⬇️",
39
  "generate.max_memory_reserved(MB)": "Reserved Memory (MB) ⬇️",
40
  "generate.max_memory_used(MB)": "Used Memory (MB) ⬇️",
 
41
  "generate.energy_consumption(tokens/kWh)": "Energy (tokens/kWh) ⬇️",
42
  }
43
  SORTING_COLUMN = ["Score", "generate.throughput(tokens/s)"]
44
  SORTING_ASCENDING = [False, True]
 
45
  ALL_COLUMNS_DATATYPES = [
46
  # open llm
47
  "markdown",
 
64
  "number",
65
  "number",
66
  ]
67
+ # download data
68
+ hf_hub_download(
69
+ repo_id="optimum/llm-perf-dataset",
70
+ filename="open-llm.csv",
71
+ local_dir="dataset",
72
+ repo_type="dataset",
73
+ token=HF_TOKEN,
74
+ )
75
+ OPEN_LLM = pd.read_csv("dataset/open-llm.csv")
76
 
77
+ MACHINE_TO_DATAFRAME = {}
78
+ for machine in MACHINE_TO_HARDWARE:
 
 
 
 
 
 
 
 
79
  hf_hub_download(
80
  repo_id="optimum/llm-perf-dataset",
81
  filename=f"{machine}/full-report.csv",
 
83
  repo_type="dataset",
84
  token=HF_TOKEN,
85
  )
86
+ MACHINE_TO_DATAFRAME[machine] = pd.read_csv(f"dataset/{machine}/full-report.csv")
87
+
88
 
89
+ def get_benchmark_df(machine="hf-dgx-01"):
90
  # merge on model
91
+ llm_perf = MACHINE_TO_DATAFRAME[machine].copy()
92
+ merged_df = OPEN_LLM.merge(llm_perf, left_on="Model", right_on="model")
93
  # transpose energy consumption
94
  merged_df["generate.energy_consumption(tokens/kWh)"] = (
95
  1 / merged_df["generate.energy_consumption(kWh/token)"].fillna(1)
 
99
  merged_df["generate.energy_consumption(tokens/kWh)"] == 1,
100
  "generate.energy_consumption(tokens/kWh)",
101
  ] = pd.NA
102
+ # add optimization column
103
+ merged_df["optimization"] = merged_df[
104
  ["backend.to_bettertransformer", "backend.use_flash_attention_2"]
105
  ].apply(
106
  lambda x: "BetterTransformer"
 
132
  copy_df["Model 🤗"] = copy_df["Model 🤗"].apply(process_model_name)
133
  copy_df["Arch 🏛️"] = copy_df["Arch 🏛️"].apply(process_model_arch)
134
  # process quantization
135
+ copy_df["Open LLM Score (%) ⬆️"] = copy_df.apply(
136
+ lambda x: f"{x['Open LLM Score (%) ⬆️']}**"
137
  if x["Quantization 🗜️"] in ["BnB.4bit", "GPTQ.4bit"]
138
+ else x["Open LLM Score (%) ⬆️"],
139
  axis=1,
140
  )
141
  return copy_df
 
148
  # plot
149
  fig = px.scatter(
150
  copy_df,
151
+ y="Open LLM Score (%) ⬆️",
152
  x="E2E Latency (s) ⬇️",
153
  size="Allocated Memory (MB) ⬇️",
154
  color="Arch 🏛️",
 
164
  "yanchor": "top",
165
  },
166
  xaxis_title="Per 1000 Tokens Latency (s)",
167
+ yaxis_title="Open LLM Score (%)",
168
  legend_title="LLM Architecture",
169
  width=1200,
170
  height=600,
 
185
  backends,
186
  datatypes,
187
  optimizations,
188
+ quantizations,
189
  score,
190
  memory,
191
  machine,
 
195
  raw_df["Model 🤗"].str.contains(text, case=False)
196
  & raw_df["Backend 🏭"].isin(backends)
197
  & raw_df["Dtype 📥"].isin(datatypes)
198
+ & raw_df["Optimization 🛠️"].isin(optimizations)
199
+ & raw_df["Quantization 🗜️"].isin(quantizations)
200
+ & (raw_df["Open LLM Score (%) ⬆️"] >= score)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  & (raw_df["Allocated Memory (MB) ⬇️"] <= memory)
202
  ]
203
  filtered_table = get_benchmark_table(filtered_df)
 
266
  with gr.Row():
267
  with gr.Column(scale=1):
268
  score_slider = gr.Slider(
269
+ label="Open LLM Score (%) 📈",
270
  info="🎚️ Slide to minimum Open LLM score",
271
  value=0,
272
  elem_id="threshold-slider",
 
298
  elem_id="dtype-checkboxes",
299
  )
300
  with gr.Column(scale=1):
301
+ optimization_checkboxes = gr.CheckboxGroup(
302
  label="Optimizations 🛠️",
303
+ choices=["None", "BetterTransformer", "FlashAttentionV2"],
304
+ value=["None", "BetterTransformer", "FlashAttentionV2"],
305
+ info="☑️ Select the optimization",
306
+ elem_id="optimization-checkboxes",
307
  )
308
  with gr.Column(scale=1):
309
  quantization_checkboxes = gr.CheckboxGroup(
 
325
  search_bar,
326
  backend_checkboxes,
327
  datatype_checkboxes,
328
+ optimization_checkboxes,
329
  quantization_checkboxes,
330
  score_slider,
331
  memory_slider,
script.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from huggingface_hub import hf_hub_download
2
+ import pandas as pd
3
+
4
+
5
+ hf_hub_download(
6
+ repo_id="optimum/llm-perf-dataset",
7
+ filename="open-llm.csv",
8
+ local_dir="dataset",
9
+ repo_type="dataset",
10
+ )
11
+
12
+ open_llm = pd.read_csv("dataset/open-llm.csv")
13
+ print(open_llm["Arch"].unique())
14
+ print(open_llm[open_llm["Arch"] == "rwkv"]["Model"].unique())
src/utils.py CHANGED
@@ -18,26 +18,31 @@ def change_tab(query_param):
18
 
19
 
20
  LLM_MODEL_ARCHS = {
21
- # branded ?
22
- "gpt_bigcode": "GPT-BigCode 🌸",
23
  "RefinedWebModel": "Falcon 🦅",
 
24
  "RefinedWeb": "Falcon 🦅",
25
  "baichuan": "Baichuan 🌊",
 
 
 
26
  "bloom": "Bloom 🌸",
27
  "llama": "LLaMA 🦙",
28
- # unbranded ? suggest something
 
 
 
29
  "stablelm_alpha": "StableLM-Alpha",
30
  "gpt_neox": "GPT-NeoX",
31
  "gpt_neo": "GPT-Neo",
32
- "codegen": "CodeGen",
33
  "chatglm": "ChatGLM",
 
34
  "gpt2": "GPT-2",
35
  "gptj": "GPT-J",
36
  "xglm": "XGLM",
37
  "rwkv": "RWKV",
38
  "bart": "BART",
39
  "opt": "OPT",
40
- "mpt": "MPT",
41
  }
42
 
43
 
 
18
 
19
 
20
  LLM_MODEL_ARCHS = {
21
+ "mixformer-sequential": "Phi φ",
 
22
  "RefinedWebModel": "Falcon 🦅",
23
+ "gpt_bigcode": "StarCoder ⭐",
24
  "RefinedWeb": "Falcon 🦅",
25
  "baichuan": "Baichuan 🌊",
26
+ "mistral": "Mistral Ⓜ️",
27
+ "codegen": "CodeGen ♾️",
28
+ "falcon": "Falcon 🦅",
29
  "bloom": "Bloom 🌸",
30
  "llama": "LLaMA 🦙",
31
+ "mpt": "MPT 🧱",
32
+ "Yi": "Yi 人",
33
+ # suggest something
34
+ "stablelm_epoch": "StableLM-Epoch",
35
  "stablelm_alpha": "StableLM-Alpha",
36
  "gpt_neox": "GPT-NeoX",
37
  "gpt_neo": "GPT-Neo",
 
38
  "chatglm": "ChatGLM",
39
+ "internlm": "InternLM",
40
  "gpt2": "GPT-2",
41
  "gptj": "GPT-J",
42
  "xglm": "XGLM",
43
  "rwkv": "RWKV",
44
  "bart": "BART",
45
  "opt": "OPT",
 
46
  }
47
 
48