mwitiderrick committed
Commit
ce24af6
1 Parent(s): a1c669c

Update recipe.yaml

Files changed (1):
  recipe.yaml +30 -16
recipe.yaml CHANGED
@@ -1,27 +1,41 @@
 test_stage:
   obcq_modifiers:
-    SmoothQuantModifier:
-      smoothing_strength: 0.8
+    LogarithmicEqualizationModifier:
       mappings: [
         [["re:.*q_proj", "re:.*k_proj", "re:.*v_proj"], "re:.*input_layernorm"],
-        [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]
-      ]
+        [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"],
+      ]
     QuantizationModifier:
       ignore:
-        # These operations don't make sense to quantize
-        - LlamaRotaryEmbedding
-        - LlamaRMSNorm
-        - SiLUActivation
-        # Skip quantizing the BMMs
-        - QuantizableMatMul
-        # Skip quantizing the layers with the most sensitive activations
-        - model.layers.21.mlp.down_proj
-        - model.layers.7.mlp.down_proj
-        - model.layers.2.mlp.down_proj
-        - model.layers.8.self_attn.q_proj
-        - model.layers.8.self_attn.k_proj
+        # These operations don't make sense to quantize
+        - LlamaRotaryEmbedding
+        - LlamaRMSNorm
+        - SiLUActivation
+        - MatMulOutput_QK
+        - MatMulOutput_PV
+        # Skip quantizing the layers with the most sensitive activations
+        - model.layers.21.mlp.down_proj
+        - model.layers.7.mlp.down_proj
+        - model.layers.2.mlp.down_proj
+        - model.layers.8.self_attn.q_proj
+        - model.layers.8.self_attn.k_proj
       post_oneshot_calibration: true
       scheme_overrides:
+        # Enable channelwise quantization for better accuracy
+        Linear:
+          weights:
+            num_bits: 8
+            symmetric: true
+            strategy: channel
+        MatMulLeftInput_QK:
+          input_activations:
+            num_bits: 8
+            symmetric: true
+        MatMulLeftInput_PV:
+          input_activations:
+            num_bits: 8
+            symmetric: true
+        # For the embeddings, only weight-quantization makes sense
         Embedding:
           input_activations: null
           weights:
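
In short, the update swaps the SmoothQuantModifier for a LogarithmicEqualizationModifier, quantizes the Q·K and P·V matmul left inputs while ignoring their outputs, and moves Linear layers to symmetric channel-wise 8-bit weight quantization. For context, a recipe like this is applied through SparseML's one-shot (OBCQ) calibration flow. The snippet below is a minimal sketch and is not part of this commit: the base model ID and calibration dataset are placeholders chosen for illustration, and the exact import path and argument names of the one-shot entry point can differ between SparseML versions.

# Hypothetical usage sketch (assumptions noted above), not part of this commit.
from sparseml.transformers import SparseAutoModelForCausalLM, oneshot

# Placeholder base model; substitute the model this recipe actually targets.
model = SparseAutoModelForCausalLM.from_pretrained(
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    device_map="auto",
)

# Run one-shot calibration/quantization with the recipe updated in this commit.
oneshot(
    model=model,
    dataset="open_platypus",        # assumed calibration dataset
    recipe="recipe.yaml",           # the recipe edited above
    max_seq_length=512,
    num_calibration_samples=512,
    output_dir="./obcq-quantized",  # where the calibrated model is saved
)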