nm-testing
/

TinyLlama-1.1B-Chat-v1.0-pruned50-quant-ds

Text Generation

Model card Files Files and versions Community

mwitiderrick committed on Jan 4

Commit

ce24af6

•

1 Parent(s): a1c669c

Update recipe.yaml

Files changed (1) hide show

recipe.yaml +30 -16

recipe.yaml CHANGED Viewed

@@ -1,27 +1,41 @@
 test_stage:
   obcq_modifiers:
-    SmoothQuantModifier:
-      smoothing_strength: 0.8
       mappings: [
         [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
-        [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]
-      ]
     QuantizationModifier:
       ignore:
-      # These operations don't make sense to quantize
-      - LlamaRotaryEmbedding
-      - LlamaRMSNorm
-      - SiLUActivation
-      # Skip quantizing the BMMs
-      - QuantizableMatMul
-      # Skip quantizing the layers with the most sensitive activations
-      - model.layers.21.mlp.down_proj
-      - model.layers.7.mlp.down_proj
-      - model.layers.2.mlp.down_proj
-      - model.layers.8.self_attn.q_proj
-      - model.layers.8.self_attn.k_proj
       post_oneshot_calibration: true
       scheme_overrides:
         Embedding:
           input_activations: null
           weights:

 test_stage:
   obcq_modifiers:
+    LogarithmicEqualizationModifier:
       mappings: [
         [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
+        [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"],
+      ]
     QuantizationModifier:
       ignore:
+        # These operations don't make sense to quantize
+        - LlamaRotaryEmbedding
+        - LlamaRMSNorm
+        - SiLUActivation
+        - MatMulOutput_QK
+        - MatMulOutput_PV
+        # Skip quantizing the layers with the most sensitive activations
+        - model.layers.21.mlp.down_proj
+        - model.layers.7.mlp.down_proj
+        - model.layers.2.mlp.down_proj
+        - model.layers.8.self_attn.q_proj
+        - model.layers.8.self_attn.k_proj
       post_oneshot_calibration: true
       scheme_overrides:
+        # Enable channelwise quantization for better accuracy
+        Linear:
+          weights:
+            num_bits: 8
+            symmetric: true
+            strategy: channel
+        MatMulLeftInput_QK:
+          input_activations:
+            num_bits: 8
+            symmetric: true
+        MatMulLeftInput_PV:
+          input_activations:
+            num_bits: 8
+            symmetric: true
+        # For the embeddings, only weight-quantization makes sense
         Embedding:
           input_activations: null
           weights: