Commit b36a0b0
Parent(s): f8c20fd
Update Gemma model to use AWQ quantized version

- Change repo_id from router-gemma3-merged to router-gemma3-merged-awq
- vLLM will auto-detect AWQ quantization from quantization_config.json
- Weights in the 'default' subfolder are handled automatically by vLLM
app.py CHANGED
@@ -111,7 +111,7 @@ MODELS = {
         "quantization": "awq",  # vLLM will auto-detect AWQ
     },
     "Router-Gemma3-27B-AWQ": {
-        "repo_id": "Alovestocode/router-gemma3-merged",
+        "repo_id": "Alovestocode/router-gemma3-merged-awq",  # AWQ quantized model
         "description": "Router checkpoint on Gemma3 27B merged, optimized with AWQ quantization via vLLM.",
         "params_b": 27.0,
         "quantization": "awq",  # vLLM will auto-detect AWQ
@@ -200,16 +200,20 @@ def load_vllm_model(model_name: str):
     # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
     if quantization == "awq":
         llm_kwargs["quantization"] = "awq"
-        # vLLM will auto-detect AWQ weights
+        # vLLM will auto-detect AWQ weights from quantization_config.json at repo root
+        # Weights may be in a 'default' subfolder (LLM Compressor stage structure)
+        # vLLM handles this automatically via the quantization config
         # Enable FP8 KV cache for 50% memory reduction (allows longer contexts)
         # FP8 KV cache is compatible with AWQ quantization
         try:
             llm_kwargs["kv_cache_dtype"] = "fp8"
             print(f"  ✓ AWQ quantization + FP8 KV cache enabled (vLLM native support)")
             print(f"  ✓ FP8 KV cache reduces memory by ~50%, enabling longer contexts")
+            print(f"  ✓ Loading AWQ model from: {repo}")
         except Exception:
             # Fallback if FP8 KV cache not supported
             print(f"  ✓ AWQ quantization enabled (FP8 KV cache not available)")
+            print(f"  ✓ Loading AWQ model from: {repo}")
     elif quantization == "fp8":
         # Try FP8 quantization if available (faster than AWQ)
         try: