Upload README.md with huggingface_hub
README.md CHANGED
@@ -42,7 +42,7 @@ This is a **professionally quantized 4-bit AWQ version** of [Z.ai's GLM-4.6](htt
 - **License**: MIT (inherited from base model)
 - **Quantization**: AWQ 4-bit with group size 128
 - **Active Parameters**: 28.72B per token (8 of 160 experts)
-- **Quantization Framework**:
+- **Quantization Framework**: llmcompressor 0.8.1.dev0
 - **Optimization**: Marlin kernels for NVIDIA GPUs
 - **Context Length**: Up to 200K tokens (131K recommended for optimal performance)
 - **Languages**: English, Chinese
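The spec list above mentions Marlin kernels and a recommended 131K context. For reference, a minimal serving sketch, assuming vLLM with its AWQ-Marlin path; the checkpoint path, context cap, and parallelism degree are illustrative assumptions, not taken from the card:

```python
# Hypothetical serving sketch (not from the diff): loading the quantized
# checkpoint with vLLM; path, tensor_parallel_size, and quantization setting
# are assumptions for illustration.
from vllm import LLM, SamplingParams

llm = LLM(
    model="./GLM-4.6-AWQ",      # local path to the quantized checkpoint (assumed)
    quantization="awq_marlin",  # route AWQ weights through Marlin kernels
    max_model_len=131072,       # the card's recommended 131K context
    tensor_parallel_size=8,     # assumption: sized for a multi-GPU node
)

out = llm.generate(["Explain AWQ in one sentence."], SamplingParams(max_tokens=64))
print(out[0].outputs[0].text)
```

vLLM can usually detect AWQ checkpoints from the saved quantization config; passing `quantization="awq_marlin"` just makes the Marlin kernel choice explicit.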
@@ -221,36 +221,67 @@ print(response.choices[0].message.content)
 This model was quantized using the following configuration:
 
 ```python
-from llmcompressor
+from llmcompressor import oneshot
+from llmcompressor.modifiers.awq import AWQModifier
 from datasets import load_dataset
 
 # Load calibration data from Neural Magic's curated dataset
 dataset = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
 dataset = dataset.shuffle(seed=42).select(range(512))
 
+# Define ignore patterns and targets
+ignore_patterns = [
+    "lm_head",
+    "model.embed_tokens",
+    "re:.*input_layernorm$",
+    "re:.*post_attention_layernorm$",
+    "model.norm",
+    "re:.*q_norm$",
+    "re:.*k_norm$",
+    "re:.*shared_experts.*",
+    "re:.*mlp\\.gate\\.weight$",
+    "re:.*mlp\\.gate\\..*bias$",
+    "re:model.layers.[0-2]\\.",
+]
+
+targets = [
+    "re:.*gate_proj.*",
+    "re:.*up_proj.*",
+    "re:.*down_proj.*",
+    "re:.*k_proj.*",
+    "re:.*q_proj.*",
+    "re:.*v_proj.*",
+    "re:.*o_proj.*",
+]
+
 # AWQ quantization recipe
-recipe =
-
-
-
-
-
-
-
-
-
-
-
-
-
-""
+recipe = [
+    AWQModifier(
+        ignore=ignore_patterns,
+        config_groups={
+            "group_0": {
+                "targets": targets,
+                "weights": {
+                    "num_bits": 4,
+                    "type": "int",
+                    "symmetric": True,
+                    "group_size": 128,
+                    "strategy": "group",
+                    "dynamic": False,
+                },
+                "input_activations": None,
+                "output_activations": None,
+                "format": None,
+            }
+        },
+    )
+]
 
 # Apply quantization
 oneshot(
-    model=
+    model=model,  # Pre-loaded AutoModelForCausalLM
     dataset=dataset,
     recipe=recipe,
-    output_dir="./GLM-4.6-AWQ",
    max_seq_length=2048,
    num_calibration_samples=512
 )
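The updated snippet passes `model=model` to `oneshot(...)` with a comment that the model is pre-loaded, but never constructs it. A minimal sketch of that missing step, assuming the base checkpoint is loaded with transformers; the repo id and keyword arguments are assumptions, not part of the diff:

```python
# Minimal pre-loading sketch assumed by `oneshot(model=model, ...)` above;
# the repo id and kwargs are illustrative assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "zai-org/GLM-4.6"  # base model named in the card (repo id assumed)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",  # keep the checkpoint's native dtype
    device_map="auto",   # shard across available GPUs
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```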
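The updated call also drops `output_dir=` from `oneshot(...)`, so nothing in the snippet writes the quantized weights to disk. Continuing the loading sketch above, a hedged sketch of an explicit save, following the pattern used in llm-compressor's published examples; the directory name is an assumption:

```python
# Explicit save after oneshot(); `save_compressed=True` follows the pattern in
# llm-compressor's examples, and the directory name is an assumption.
SAVE_DIR = "GLM-4.6-AWQ"
model.save_pretrained(SAVE_DIR, save_compressed=True)  # write compressed weights
tokenizer.save_pretrained(SAVE_DIR)
```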