bullpoint committed (verified)
Commit ff8307c · Parent(s): d578beb

Upload README.md with huggingface_hub

Files changed (1): README.md (+50 -19)
README.md CHANGED
@@ -42,7 +42,7 @@ This is a **professionally quantized 4-bit AWQ version** of [Z.ai's GLM-4.6](htt
  - **License**: MIT (inherited from base model)
  - **Quantization**: AWQ 4-bit with group size 128
  - **Active Parameters**: 28.72B per token (8 of 160 experts)
- - **Quantization Framework**: llm-compressor 0.12.2
+ - **Quantization Framework**: llmcompressor 0.8.1.dev0
  - **Optimization**: Marlin kernels for NVIDIA GPUs
  - **Context Length**: Up to 200K tokens (131K recommended for optimal performance)
  - **Languages**: English, Chinese
@@ -221,36 +221,67 @@ print(response.choices[0].message.content)
  This model was quantized using the following configuration:

  ```python
- from llmcompressor.transformers import oneshot
+ from llmcompressor import oneshot
+ from llmcompressor.modifiers.awq import AWQModifier
  from datasets import load_dataset

  # Load calibration data from Neural Magic's curated dataset
  dataset = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
  dataset = dataset.shuffle(seed=42).select(range(512))

+ # Define ignore patterns and targets
+ ignore_patterns = [
+     "lm_head",
+     "model.embed_tokens",
+     "re:.*input_layernorm$",
+     "re:.*post_attention_layernorm$",
+     "model.norm",
+     "re:.*q_norm$",
+     "re:.*k_norm$",
+     "re:.*shared_experts.*",
+     "re:.*mlp\\.gate\\.weight$",
+     "re:.*mlp\\.gate\\..*bias$",
+     "re:model.layers.[0-2]\\.",
+ ]
+
+ targets = [
+     "re:.*gate_proj.*",
+     "re:.*up_proj.*",
+     "re:.*down_proj.*",
+     "re:.*k_proj.*",
+     "re:.*q_proj.*",
+     "re:.*v_proj.*",
+     "re:.*o_proj.*",
+ ]
+
  # AWQ quantization recipe
- recipe = """
- quant_stage:
-     quant_modifiers:
-         QuantizationModifier:
-             ignore: ["lm_head"]
-             config_groups:
-                 group_0:
-                     weights:
-                         num_bits: 4
-                         type: "int"
-                         symmetric: true
-                         group_size: 128
-                         strategy: "group"
-                     targets: ["Linear"]
- """
+ recipe = [
+     AWQModifier(
+         ignore=ignore_patterns,
+         config_groups={
+             "group_0": {
+                 "targets": targets,
+                 "weights": {
+                     "num_bits": 4,
+                     "type": "int",
+                     "symmetric": True,
+                     "group_size": 128,
+                     "strategy": "group",
+                     "dynamic": False,
+                 },
+                 "input_activations": None,
+                 "output_activations": None,
+                 "format": None,
+             }
+         },
+     )
+ ]

  # Apply quantization
  oneshot(
-     model="zai-org/GLM-4.6",
+     model=model,  # Pre-loaded AutoModelForCausalLM
      dataset=dataset,
      recipe=recipe,
-     output_dir="./GLM-4.6-AWQ",
      max_seq_length=2048,
      num_calibration_samples=512
  )
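
Note that the updated snippet passes a pre-loaded `model` object to `oneshot` instead of the model id string used previously. A minimal sketch of that loading step, which is not part of this commit (the dtype and device-map arguments are assumptions), could look like:

```python
# Sketch only: load the base model that the updated snippet refers to as `model`.
# The dtype/device settings are assumptions, not part of the commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "zai-org/GLM-4.6"  # base model id used in the previous snippet

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",      # keep the checkpoint's native dtype for calibration
    device_map="auto",       # assumption: shard layers across available GPUs
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
```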
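
The Marlin-kernel and context-length bullets in the first hunk describe how the resulting AWQ checkpoint is intended to be served. A hedged vLLM sketch is below; the repository id, tensor-parallel degree, and context length are assumptions, not values stated in this commit:

```python
# Sketch only: serving an AWQ checkpoint with vLLM's Marlin kernels.
# The repo id and tensor_parallel_size below are assumptions.
from vllm import LLM, SamplingParams

llm = LLM(
    model="bullpoint/GLM-4.6-AWQ",  # hypothetical repo id for this upload
    quantization="awq_marlin",      # Marlin AWQ kernels on NVIDIA GPUs
    tensor_parallel_size=8,         # assumption: adjust to your GPU count
    max_model_len=131072,           # the recommended 131K context window
)
outputs = llm.generate(["Hello, GLM-4.6!"], SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)
```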