Spaces: optimum/llm-perf-leaderboard

Running

Add torchao int4 weight only quantization as an option

#34

by jerryzh168 - opened 20 days ago

←

Files changed (4) hide show

hardware.yaml CHANGED Viewed

@@ -19,6 +19,7 @@
     - awq
     - bnb
     - gptq
   backends:
     - pytorch
@@ -31,6 +32,7 @@
     - awq
     - bnb
     - gptq
   backends:
     - pytorch
@@ -45,4 +47,4 @@
   backends:
     - pytorch
     - openvino
-    - onnxruntime

     - awq
     - bnb
     - gptq
+    - torchao
   backends:
     - pytorch
     - awq
     - bnb
     - gptq
+    - torchao
   backends:
     - pytorch
   backends:
     - pytorch
     - openvino
+    - onnxruntime

src/kernels.py CHANGED Viewed

@@ -38,6 +38,7 @@ def get_quant_df(llm_perf_df):
     exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
     gemm_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMM")]
     gemv_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMV")]
     # merge the three dataframes
     exllamav1_df = pd.merge(
         vanilla_df,
@@ -63,8 +64,14 @@ def get_quant_df(llm_perf_df):
         on=["Model 🤗"],
         suffixes=["", " Custom Kernel"],
     )
     # concat the two dataframes row-wise
-    quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df])
     # compute speedups
     quant_df["Prefill Speedup (%)"] = (
         (quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100

     exllamav2_df = copy_df[(copy_df["Quantization 🗜️"] == "GPTQ.4bit+ExllamaV2")]
     gemm_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMM")]
     gemv_df = copy_df[(copy_df["Quantization 🗜️"] == "AWQ.4bit+GEMV")]
+    torchao_df = copy_df[(copy_df["Quantization 🗜️"] == "torchao.4bit")]
     # merge each per-kernel dataframe with vanilla_df on the model name
     exllamav1_df = pd.merge(
         vanilla_df,
         on=["Model 🤗"],
         suffixes=["", " Custom Kernel"],
     )
+    torchao_df = pd.merge(
+        vanilla_df,
+        torchao_df,
+        on=["Model 🤗"],
+        suffixes=["", " Custom Kernel"],
+    )
     # concat the merged per-kernel dataframes row-wise
+    quant_df = pd.concat([exllamav1_df, exllamav2_df, gemm_df, gemv_df, torchao_df])
     # compute speedups
     quant_df["Prefill Speedup (%)"] = (
         (quant_df["Prefill (s)"] / quant_df["Prefill (s) Custom Kernel"]) * 100

src/panel.py CHANGED Viewed

@@ -26,7 +26,7 @@ def create_control_panel(
     if hardware_provider == "nvidia":
         backends = ["pytorch"]
         attention_implementations = ["Eager", "SDPA", "FAv2"]
-        quantizations = ["Unquantized", "BnB.4bit", "BnB.8bit", "AWQ.4bit", "GPTQ.4bit"]
         kernels = [
             "No Kernel",
             "GPTQ.ExllamaV1",

     if hardware_provider == "nvidia":
         backends = ["pytorch"]
         attention_implementations = ["Eager", "SDPA", "FAv2"]
+        quantizations = ["Unquantized", "BnB.4bit", "BnB.8bit", "AWQ.4bit", "GPTQ.4bit", "torchao.4bit"]
         kernels = [
             "No Kernel",
             "GPTQ.ExllamaV1",

src/utils.py CHANGED Viewed

@@ -70,6 +70,11 @@ def process_quantizations(x):
         and x["config.backend.quantization_config.bits"] == 4
     ):
         return "AWQ.4bit"
     else:
         return "Unquantized"

         and x["config.backend.quantization_config.bits"] == 4
     ):
         return "AWQ.4bit"
+    elif (
+            x["config.backend.quantization_scheme"] == "torchao"
+            and x["config.backend.quantization_config.quant_type"] == "int4_weight_only"
+    ):
+        return "torchao.4bit"
     else:
         return "Unquantized"