Commit b36a0b0 · committed by Alikestocode · 1 Parent(s): f8c20fd

Update Gemma model to use AWQ quantized version


- Change repo_id from router-gemma3-merged to router-gemma3-merged-awq
- vLLM will auto-detect AWQ quantization from quantization_config.json
- Weights in 'default' subfolder are handled automatically by vLLM
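For context, the auto-detection described above hinges on the quantization metadata published in the AWQ repo. The snippet below is a minimal, hypothetical sanity check (not part of this commit) that fetches the config the commit message refers to and prints its quantization block; the fallback to config.json and the REPO_ID constant are assumptions for illustration.

    import json
    from huggingface_hub import hf_hub_download

    REPO_ID = "Alovestocode/router-gemma3-merged-awq"

    # The commit message says vLLM auto-detects AWQ from quantization_config.json;
    # quantization metadata is also conventionally embedded in config.json under
    # "quantization_config", so fall back to that if the standalone file is absent.
    try:
        path = hf_hub_download(repo_id=REPO_ID, filename="quantization_config.json")
        quant_cfg = json.load(open(path))
    except Exception:
        path = hf_hub_download(repo_id=REPO_ID, filename="config.json")
        quant_cfg = json.load(open(path)).get("quantization_config", {})

    print(json.dumps(quant_cfg, indent=2))  # e.g. quant_method, bits, group_size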

Files changed (1)
  1. app.py +6 -2
app.py CHANGED

@@ -111,7 +111,7 @@ MODELS = {
         "quantization": "awq",  # vLLM will auto-detect AWQ
     },
     "Router-Gemma3-27B-AWQ": {
-        "repo_id": "Alovestocode/router-gemma3-merged",
+        "repo_id": "Alovestocode/router-gemma3-merged-awq",  # AWQ quantized model
         "description": "Router checkpoint on Gemma3 27B merged, optimized with AWQ quantization via vLLM.",
         "params_b": 27.0,
         "quantization": "awq",  # vLLM will auto-detect AWQ
@@ -200,16 +200,20 @@ def load_vllm_model(model_name: str):
     # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
     if quantization == "awq":
         llm_kwargs["quantization"] = "awq"
-        # vLLM will auto-detect AWQ weights if present (handled by llm-compressor)
+        # vLLM will auto-detect AWQ weights from quantization_config.json at repo root
+        # Weights may be in a 'default' subfolder (LLM Compressor stage structure)
+        # vLLM handles this automatically via the quantization config
         # Enable FP8 KV cache for 50% memory reduction (allows longer contexts)
         # FP8 KV cache is compatible with AWQ quantization
         try:
             llm_kwargs["kv_cache_dtype"] = "fp8"
             print(f"  → AWQ quantization + FP8 KV cache enabled (vLLM native support)")
             print(f"  → FP8 KV cache reduces memory by ~50%, enabling longer contexts")
+            print(f"  → Loading AWQ model from: {repo}")
         except Exception:
             # Fallback if FP8 KV cache not supported
             print(f"  → AWQ quantization enabled (FP8 KV cache not available)")
+            print(f"  → Loading AWQ model from: {repo}")
     elif quantization == "fp8":
         # Try FP8 quantization if available (faster than AWQ)
         try:
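For reference, the kwargs assembled in load_vllm_model reduce to a vLLM call along the lines of the sketch below. This is a standalone illustration of the intended load path, not code from app.py; the prompt and sampling settings are made up, and only the repo_id, quantization, and kv_cache_dtype values come from the diff above.

    from vllm import LLM, SamplingParams

    # Mirrors the diff: AWQ quantization plus FP8 KV cache, pointed at the new AWQ repo.
    llm = LLM(
        model="Alovestocode/router-gemma3-merged-awq",
        quantization="awq",    # vLLM can also auto-detect this from the repo's quantization config
        kv_cache_dtype="fp8",  # ~50% KV cache memory reduction, per the comments in load_vllm_model
    )

    params = SamplingParams(temperature=0.2, max_tokens=128)
    outputs = llm.generate(["Route this request: summarize the attached document."], params)
    print(outputs[0].outputs[0].text)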