nm-testing
/

llama2-7b-gsm8k-pt-pruned50-quant-ds

Text Generation

text-generation-inference

Model card Files Files and versions Community

mwitiderrick committed on Feb 2

Commit

6f7231f

•

1 Parent(s): 470d8d3

Create quantize.yaml

Files changed (1) hide show

quantize.yaml +41 -0

quantize.yaml ADDED Viewed

	@@ -0,0 +1,41 @@

+test_stage:  # top-level key; its only child group is obcq_modifiers
+  obcq_modifiers:  # holds two modifier entries, listed in this order
+    LogarithmicEqualizationModifier:  # NOTE(review): semantics defined by the consuming tool - confirm
+      mappings: [  # each entry: [list of projection-layer patterns, one layernorm pattern]
+        [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
+        [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"],
+      ]
+    QuantizationModifier:
+      ignore:  # five op/module type names, then five specific mlp.down_proj layers
+        # These operations don't make sense to quantize
+        - LlamaRotaryEmbedding
+        - LlamaRMSNorm
+        - SiLUActivation
+        - MatMulOutput_QK
+        - MatMulOutput_PV
+        # Skip quantizing the layers with the most sensitive activations
+        - model.layers.1.mlp.down_proj
+        - model.layers.30.mlp.down_proj
+        - model.layers.31.mlp.down_proj
+        - model.layers.28.mlp.down_proj
+        - model.layers.29.mlp.down_proj
+      post_oneshot_calibration: true
+      scheme_overrides:  # per-type settings; presumably replace a default scheme - TODO confirm
+        Linear:  # weights only: 8-bit, symmetric, strategy "channel"
+          weights:
+            num_bits: 8
+            symmetric: true
+            strategy: channel
+        MatMulLeftInput_QK:  # input activations only: 8-bit, symmetric
+          input_activations:
+            num_bits: 8
+            symmetric: true
+        MatMulLeftInput_PV:  # same values as MatMulLeftInput_QK
+          input_activations:
+            num_bits: 8
+            symmetric: true
+        Embedding:
+          input_activations: null  # explicitly null; presumably disables activation quantization - TODO confirm
+          weights:  # 8-bit with symmetric: false, unlike the Linear weights above
+            num_bits: 8
+            symmetric: false