Upload README.md with huggingface_hub
README.md CHANGED
@@ -42,7 +42,7 @@ This is a **professionally quantized 4-bit AWQ version** of [Z.ai's GLM-4.6](htt
 - **License**: MIT (inherited from base model)
 - **Quantization**: AWQ 4-bit with group size 128
 - **Active Parameters**: 28.72B per token (8 of 160 experts)
-- **Quantization Framework**:
+- **Quantization Framework**: llmcompressor 0.8.1.dev0
 - **Optimization**: Marlin kernels for NVIDIA GPUs
 - **Context Length**: Up to 200K tokens (131K recommended for optimal performance)
 - **Languages**: English, Chinese
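The spec list above mentions Marlin kernels and a recommended 131K context. For reference, a minimal serving sketch, assuming vLLM with its AWQ-Marlin path; the checkpoint path, context cap, and parallelism degree are illustrative assumptions, not taken from the card:

```python
# Hypothetical serving sketch (not from the diff): loading the quantized
# checkpoint with vLLM; path, tensor_parallel_size, and quantization setting
# are assumptions for illustration.
from vllm import LLM, SamplingParams

llm = LLM(
    model="./GLM-4.6-AWQ",      # local path to the quantized checkpoint (assumed)
    quantization="awq_marlin",  # route AWQ weights through Marlin kernels
    max_model_len=131072,       # the card's recommended 131K context
    tensor_parallel_size=8,     # assumption: sized for a multi-GPU node
)

out = llm.generate(["Explain AWQ in one sentence."], SamplingParams(max_tokens=64))
print(out[0].outputs[0].text)
```

vLLM can usually detect AWQ checkpoints from the saved quantization config; passing `quantization="awq_marlin"` just makes the Marlin kernel choice explicit.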
@@ -221,36 +221,67 @@ print(response.choices[0].message.content)
 This model was quantized using the following configuration:
 
 ```python
-from llmcompressor
+from llmcompressor import oneshot
+from llmcompressor.modifiers.awq import AWQModifier
 from datasets import load_dataset
 
 # Load calibration data from Neural Magic's curated dataset
 dataset = load_dataset("neuralmagic/LLM_compression_calibration", split="train")
 dataset = dataset.shuffle(seed=42).select(range(512))
 
+# Define ignore patterns and targets
+ignore_patterns = [
+    "lm_head",
+    "model.embed_tokens",
+    "re:.*input_layernorm$",
+    "re:.*post_attention_layernorm$",
+    "model.norm",
+    "re:.*q_norm$",
+    "re:.*k_norm$",
+    "re:.*shared_experts.*",
+    "re:.*mlp\\.gate\\.weight$",
+    "re:.*mlp\\.gate\\..*bias$",
+    "re:model.layers.[0-2]\\.",
+]
+
+targets = [
+    "re:.*gate_proj.*",
+    "re:.*up_proj.*",
+    "re:.*down_proj.*",
+    "re:.*k_proj.*",
+    "re:.*q_proj.*",
+    "re:.*v_proj.*",
+    "re:.*o_proj.*",
+]
+
 # AWQ quantization recipe
-recipe =
-
-
-
-
-
-
-
-
-
-
-
-
-
-""
+recipe = [
+    AWQModifier(
+        ignore=ignore_patterns,
+        config_groups={
+            "group_0": {
+                "targets": targets,
+                "weights": {
+                    "num_bits": 4,
+                    "type": "int",
+                    "symmetric": True,
+                    "group_size": 128,
+                    "strategy": "group",
+                    "dynamic": False,
+                },
+                "input_activations": None,
+                "output_activations": None,
+                "format": None,
+            }
+        },
+    )
+]
 
 # Apply quantization
 oneshot(
-    model=
+    model=model,  # Pre-loaded AutoModelForCausalLM
     dataset=dataset,
     recipe=recipe,
-    output_dir="./GLM-4.6-AWQ",
    max_seq_length=2048,
    num_calibration_samples=512
 )
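The updated snippet passes `model=model` to `oneshot(...)` with a comment that the model is pre-loaded, but never constructs it. A minimal sketch of that missing step, assuming the base checkpoint is loaded with transformers; the repo id and keyword arguments are assumptions, not part of the diff:

```python
# Minimal pre-loading sketch assumed by `oneshot(model=model, ...)` above;
# the repo id and kwargs are illustrative assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_ID = "zai-org/GLM-4.6"  # base model named in the card (repo id assumed)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",  # keep the checkpoint's native dtype
    device_map="auto",   # shard across available GPUs
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
```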
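The updated call also drops `output_dir=` from `oneshot(...)`, so nothing in the snippet writes the quantized weights to disk. Continuing the loading sketch above, a hedged sketch of an explicit save, following the pattern used in llm-compressor's published examples; the directory name is an assumption:

```python
# Explicit save after oneshot(); `save_compressed=True` follows the pattern in
# llm-compressor's examples, and the directory name is an assumption.
SAVE_DIR = "GLM-4.6-AWQ"
model.save_pretrained(SAVE_DIR, save_compressed=True)  # write compressed weights
tokenizer.save_pretrained(SAVE_DIR)
```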