Add torchao int4 weight-only quantization as an option

#34
by jerryzh168 - opened
Files changed (4) hide show
  1. hardware.yaml +3 -1
  2. src/kernels.py +8 -1
  3. src/panel.py +1 -1
  4. src/utils.py +5 -0
hardware.yaml CHANGED
@@ -19,6 +19,7 @@
19
  - awq
20
  - bnb
21
  - gptq
 
22
  backends:
23
  - pytorch
24
 
@@ -31,6 +32,7 @@
31
  - awq
32
  - bnb
33
  - gptq
 
34
  backends:
35
  - pytorch
36
 
@@ -45,4 +47,4 @@
45
  backends:
46
  - pytorch
47
  - openvino
48
- - onnxruntime
 
19
  - awq
20
  - bnb
21
  - gptq
22
+ - torchao
23
  backends:
24
  - pytorch
25
 
 
32
  - awq
33
  - bnb
34
  - gptq
35
+ - torchao
36
  backends:
37
  - pytorch
38
 
 
47
  backends:
48
  - pytorch
49
  - openvino
50
+ - onnxruntime
src/kernels.py CHANGED
@@ -38,6 +38,7 @@ def get_quant_df(llm_perf_df):
38
  exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
39
  gemm_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMM")]
40
  gemv_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMV")]
 
41
  # merge the three dataframes
42
  exllamav1_df = pd.merge(
43
  vanilla_df,
@@ -63,8 +64,14 @@ def get_quant_df(llm_perf_df):
63
  on=["Model 🤗"],
64
  suffixes=["", " Custom Kernel"],
65
  )
 
 
 
 
 
 
66
  # concat the two dataframes row-wise
67
- quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
68
  # compute speedups
69
  quant_df["Prefill Speedup (%)"] = (
70
  (quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100
 
38
  exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
39
  gemm_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMM")]
40
  gemv_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMV")]
41
+ torchao_df = copy_df[(copy_df["Quantization 🗜️"] == "torchao.4bit")]
42
  # merge the three dataframes
43
  exllamav1_df = pd.merge(
44
  vanilla_df,
 
64
  on=["Model 🤗"],
65
  suffixes=["", " Custom Kernel"],
66
  )
67
+ torchao_df = pd.merge(
68
+ vanilla_df,
69
+ torchao_df,
70
+ on=["Model 🤗"],
71
+ suffixes=["", " Custom Kernel"],
72
+ )
73
  # concat the two dataframes row-wise
74
+ quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df, torchao_df])
75
  # compute speedups
76
  quant_df["Prefill Speedup (%)"] = (
77
  (quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100
src/panel.py CHANGED
@@ -26,7 +26,7 @@ def create_control_panel(
26
  if hardware_provider == "nvidia":
27
  backends = ["pytorch"]
28
  attention_implementations = ["Eager", "SDPA", "FAv2"]
29
- quantizations = ["Unquantized", "BnB.4bit", "BnB.8bit", "AWQ.4bit", "GPTQ.4bit"]
30
  kernels = [
31
  "No Kernel",
32
  "GPTQ.ExllamaV1",
 
26
  if hardware_provider == "nvidia":
27
  backends = ["pytorch"]
28
  attention_implementations = ["Eager", "SDPA", "FAv2"]
29
+ quantizations = ["Unquantized", "BnB.4bit", "BnB.8bit", "AWQ.4bit", "GPTQ.4bit", "torchao.4bit"]
30
  kernels = [
31
  "No Kernel",
32
  "GPTQ.ExllamaV1",
src/utils.py CHANGED
@@ -70,6 +70,11 @@ def process_quantizations(x):
70
  and x["config.backend.quantization_config.bits"] == 4
71
  ):
72
  return "AWQ.4bit"
 
 
 
 
 
73
  else:
74
  return "Unquantized"
75
 
 
70
  and x["config.backend.quantization_config.bits"] == 4
71
  ):
72
  return "AWQ.4bit"
73
+ elif (
74
+ x["config.backend.quantization_scheme"] == "torchao"
75
+ and x["config.backend.quantization_config.quant_type"] == "int4_weight_only"
76
+ ):
77
+ return "torchao.4bit"
78
  else:
79
  return "Unquantized"
80