IlyasMoutawwakil HF staff committed on
Commit
5345cba
·
1 Parent(s): 7ecfa5a
app.py CHANGED
@@ -4,14 +4,14 @@ from src.assets import custom_css
4
 
5
  # from src.attention import create_attn_plots
6
  from src.content import ABOUT, CITATION_BUTTON, CITATION_BUTTON_LABEL, LOGO, TITLE
7
- from src.control_panel import (
 
 
 
8
  create_control_callback,
9
  create_control_panel,
10
  create_select_callback,
11
  )
12
- from src.leaderboard import create_leaderboard_table
13
- from src.llm_perf import get_llm_perf_df
14
- from src.map import create_lat_score_mem_plot
15
 
16
  # from custom_kernels import create_quant_krnl_plots
17
 
 
4
 
5
  # from src.attention import create_attn_plots
6
  from src.content import ABOUT, CITATION_BUTTON, CITATION_BUTTON_LABEL, LOGO, TITLE
7
+ from src.leaderboard import create_leaderboard_table
8
+ from src.llm_perf import get_llm_perf_df
9
+ from src.map import create_lat_score_mem_plot
10
+ from src.panel import (
11
  create_control_callback,
12
  create_control_panel,
13
  create_select_callback,
14
  )
 
 
 
15
 
16
  # from custom_kernels import create_quant_krnl_plots
17
 
src/kernels.py CHANGED
@@ -32,7 +32,7 @@ def get_quant_df(llm_perf_df):
32
  vanilla_df = copy_df[
33
  (copy_df["Backend 🏭"] == "pytorch")
34
  & (copy_df["DType πŸ“₯"] == "float16")
35
- & (copy_df["Quantization πŸ—œοΈ"] == "None")
36
  ]
37
  exllamav1_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV1")]
38
  exllamav2_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV2")]
 
32
  vanilla_df = copy_df[
33
  (copy_df["Backend 🏭"] == "pytorch")
34
  & (copy_df["DType πŸ“₯"] == "float16")
35
+ & (copy_df["Quantization πŸ—œοΈ"] == "Unquantized")
36
  ]
37
  exllamav1_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV1")]
38
  exllamav2_df = copy_df[(copy_df["Quantization πŸ—œοΈ"] == "GPTQ.4bit+ExllamaV2")]
src/leaderboard.py CHANGED
@@ -48,8 +48,7 @@ def get_leaderboard_df(llm_perf_df):
48
  df["Model πŸ€—"] = df["Model πŸ€—"].apply(process_model)
49
  # process quantization for leaderboard
50
  df["Open LLM Score (%)"] = df.apply(
51
- lambda x: process_score(x["Open LLM Score (%)"], x["Quantization πŸ—œοΈ"]),
52
- axis=1,
53
  )
54
  return df
55
 
 
48
  df["Model πŸ€—"] = df["Model πŸ€—"].apply(process_model)
49
  # process quantization for leaderboard
50
  df["Open LLM Score (%)"] = df.apply(
51
+ lambda x: process_score(x["Open LLM Score (%)"], x["Quantization πŸ—œοΈ"]), axis=1
 
52
  )
53
  return df
54
 
src/llm_perf.py CHANGED
@@ -61,6 +61,7 @@ def processed_llm_perf_df(llm_perf_df):
61
  assert llm_perf_df["config.scenario.generate_kwargs.max_new_tokens"].nunique() == 1
62
  assert llm_perf_df["config.scenario.generate_kwargs.min_new_tokens"].nunique() == 1
63
  # fix couple stuff
 
64
  llm_perf_df["config.name"] = llm_perf_df["config.name"].str.replace(
65
  "flash_attention_2", "fa2"
66
  )
 
61
  assert llm_perf_df["config.scenario.generate_kwargs.max_new_tokens"].nunique() == 1
62
  assert llm_perf_df["config.scenario.generate_kwargs.min_new_tokens"].nunique() == 1
63
  # fix couple stuff
64
+ llm_perf_df.dropna(subset=["report.decode.latency.p50"], inplace=True)
65
  llm_perf_df["config.name"] = llm_perf_df["config.name"].str.replace(
66
  "flash_attention_2", "fa2"
67
  )
src/map.py CHANGED
@@ -21,11 +21,12 @@ SCORE_MEMORY_LATENCY_DATA = [
21
  def get_lat_score_mem_fig(llm_perf_df):
22
  copy_df = llm_perf_df.copy()
23
  # plot
 
24
  fig = px.scatter(
25
  copy_df,
 
26
  x="End-to-End (s)",
27
  y="Open LLM Score (%)",
28
- size="Memory (MB)",
29
  color="Architecture πŸ›οΈ",
30
  custom_data=SCORE_MEMORY_LATENCY_DATA,
31
  color_discrete_sequence=px.colors.qualitative.Light24,
@@ -41,10 +42,10 @@ def get_lat_score_mem_fig(llm_perf_df):
41
  fig.update_layout(
42
  title={
43
  "text": "Latency vs. Score vs. Memory",
44
- "y": 0.95,
45
- "x": 0.5,
46
  "xanchor": "center",
47
  "yanchor": "top",
 
 
48
  },
49
  xaxis_title="Time To Generate 64 Tokens (s)",
50
  yaxis_title="Open LLM Score (%)",
@@ -52,6 +53,8 @@ def get_lat_score_mem_fig(llm_perf_df):
52
  width=1200,
53
  height=600,
54
  )
 
 
55
 
56
  return fig
57
 
 
21
  def get_lat_score_mem_fig(llm_perf_df):
22
  copy_df = llm_perf_df.copy()
23
  # plot
24
+ # filter nan memory
25
  fig = px.scatter(
26
  copy_df,
27
+ size="Memory (MB)",
28
  x="End-to-End (s)",
29
  y="Open LLM Score (%)",
 
30
  color="Architecture πŸ›οΈ",
31
  custom_data=SCORE_MEMORY_LATENCY_DATA,
32
  color_discrete_sequence=px.colors.qualitative.Light24,
 
42
  fig.update_layout(
43
  title={
44
  "text": "Latency vs. Score vs. Memory",
 
 
45
  "xanchor": "center",
46
  "yanchor": "top",
47
+ "y": 0.95,
48
+ "x": 0.5,
49
  },
50
  xaxis_title="Time To Generate 64 Tokens (s)",
51
  yaxis_title="Open LLM Score (%)",
 
53
  width=1200,
54
  height=600,
55
  )
56
+ # update x range with 95 percentile of
57
+ fig.update_xaxes(range=[0, copy_df["End-to-End (s)"].quantile(0.95)])
58
 
59
  return fig
60
 
src/{control_panel.py → panel.py} RENAMED
@@ -58,14 +58,14 @@ def create_control_panel(machine: str):
58
  quantization_checkboxes = gr.CheckboxGroup(
59
  label="Quantizations πŸ—œοΈ",
60
  choices=[
61
- "None",
62
  "BnB.4bit",
63
  "BnB.8bit",
64
  "AWQ.4bit",
65
  "GPTQ.4bit",
66
  ],
67
  value=[
68
- "None",
69
  "BnB.4bit",
70
  "BnB.8bit",
71
  "AWQ.4bit",
 
58
  quantization_checkboxes = gr.CheckboxGroup(
59
  label="Quantizations πŸ—œοΈ",
60
  choices=[
61
+ "Unquantized",
62
  "BnB.4bit",
63
  "BnB.8bit",
64
  "AWQ.4bit",
65
  "GPTQ.4bit",
66
  ],
67
  value=[
68
+ "Unquantized",
69
  "BnB.4bit",
70
  "BnB.8bit",
71
  "AWQ.4bit",
src/utils.py CHANGED
@@ -43,7 +43,7 @@ def process_architectures(model):
43
 
44
 
45
  def process_score(score, quantization):
46
- if quantization != "None":
47
  return f"{score:.2f}*"
48
  else:
49
  return f"{score:.2f} "
@@ -71,7 +71,7 @@ def process_quantizations(x):
71
  ):
72
  return "AWQ.4bit"
73
  else:
74
- return "None"
75
 
76
 
77
  def process_kernels(x):
@@ -97,7 +97,7 @@ def process_kernels(x):
97
  ):
98
  return "AWQ.GEMV"
99
  else:
100
- return "None"
101
 
102
 
103
  # def change_tab(query_param):
 
43
 
44
 
45
  def process_score(score, quantization):
46
+ if quantization != "Unquantized":
47
  return f"{score:.2f}*"
48
  else:
49
  return f"{score:.2f} "
 
71
  ):
72
  return "AWQ.4bit"
73
  else:
74
+ return "Unquantized"
75
 
76
 
77
  def process_kernels(x):
 
97
  ):
98
  return "AWQ.GEMV"
99
  else:
100
+ return "Unquantized"
101
 
102
 
103
  # def change_tab(query_param):