Commit 83a232d
1 Parent(s): a79facb
Suppress AutoAWQ deprecation warnings and improve vLLM logging
- Filter AutoAWQ deprecation warnings (vLLM handles AWQ natively via llm-compressor)
- Improve vLLM loading logs with detailed status information
- Add better error handling with traceback for debugging
- Clarify that vLLM is primary path (AutoAWQ only for Transformers fallback)
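Condensing the points above into a short, illustrative sketch (the helper below is simplified relative to app.py: it takes the repo id directly and omits the ZeroGPU-specific kwargs): AWQ models load through vLLM's native quantization support, failures print a full traceback before re-raising, and AutoAWQ is only probed for the Transformers fallback with its DeprecationWarning filtered at import time.

# Illustrative sketch only; names and signatures are simplified from app.py.
import traceback
import warnings

# AutoAWQ is deprecated upstream and only needed for the Transformers fallback,
# so its DeprecationWarning is silenced when it happens to be installed.
try:
    from awq import AutoAWQForCausalLM
    AWQ_AVAILABLE = True
    warnings.filterwarnings("ignore", category=DeprecationWarning, module="awq")
except ImportError:
    AWQ_AVAILABLE = False

from vllm import LLM

def load_vllm_model(repo: str, quantization: str | None = None) -> LLM:
    """Primary load path: vLLM handles AWQ natively, no AutoAWQ involved."""
    llm_kwargs = {"model": repo, "trust_remote_code": True}
    if quantization == "awq":
        llm_kwargs["quantization"] = "awq"  # vLLM auto-detects AWQ weights
    try:
        return LLM(**llm_kwargs)
    except Exception as exc:
        print(f"vLLM load failed for {repo}: {exc}")
        traceback.print_exc()  # full traceback for debugging, then re-raise
        raise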
app.py CHANGED

@@ -34,13 +34,16 @@ except ImportError:
     LLM_COMPRESSOR_AVAILABLE = False
     print("Warning: LLM Compressor not available (models should be pre-quantized)")

-# Try to import AWQ,
+# Try to import AWQ (deprecated, but kept for fallback compatibility)
+# Note: AutoAWQ is deprecated; vLLM handles AWQ natively via llm-compressor
 try:
     from awq import AutoAWQForCausalLM
     AWQ_AVAILABLE = True
+    import warnings
+    warnings.filterwarnings("ignore", category=DeprecationWarning, module="awq")
 except ImportError:
     AWQ_AVAILABLE = False
-    print("
+    print("Info: AutoAWQ not available (using vLLM native AWQ support instead)")

 # Always import BitsAndBytesConfig for fallback
 try:

@@ -134,6 +137,7 @@ def load_vllm_model(model_name: str):

     try:
         # vLLM configuration optimized for ZeroGPU H200 slice
+        # vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
         llm_kwargs = {
             "model": repo,
             "trust_remote_code": True,

@@ -147,17 +151,24 @@ def load_vllm_model(model_name: str):
             "enable_prefix_caching": True,  # Cache prompts for faster TTFT
         }

-        # Add quantization if specified (vLLM auto-detects AWQ)
+        # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
         if quantization == "awq":
             llm_kwargs["quantization"] = "awq"
-            # vLLM will auto-detect AWQ weights if present
+            # vLLM will auto-detect AWQ weights if present (handled by llm-compressor)
+            print(f"  β AWQ quantization enabled (vLLM native support)")

+        print(f"  β Loading with vLLM (continuous batching, PagedAttention)...")
         llm = LLM(**llm_kwargs)
         VLLM_MODELS[model_name] = llm
-        print(f"✅ vLLM model loaded: {model_name}
+        print(f"✅ vLLM model loaded: {model_name}")
+        print(f"  - Continuous batching: enabled (max {llm_kwargs['max_num_seqs']} concurrent)")
+        print(f"  - Prefix caching: enabled")
+        print(f"  - Quantization: {quantization or 'none (bf16)'}")
         return llm
     except Exception as exc:
         print(f"❌ vLLM load failed for {repo}: {exc}")
+        import traceback
+        traceback.print_exc()
         raise


@@ -202,12 +213,16 @@ def load_awq_pipeline(repo: str, tokenizer):

 def load_pipeline(model_name: str):
     """Load model with vLLM (preferred) or Transformers (fallback)."""
-    # Try vLLM first (best performance with AWQ support)
+    # Try vLLM first (best performance with native AWQ support via llm-compressor)
+    # vLLM handles AWQ natively, so AutoAWQ deprecation doesn't affect us
     if VLLM_AVAILABLE:
         try:
+            print(f"Attempting to load {model_name} with vLLM (native AWQ support)...")
             return load_vllm_model(model_name)
         except Exception as exc:
-            print(f"vLLM load failed, falling back to Transformers: {exc}")
+            print(f"⚠️ vLLM load failed, falling back to Transformers: {exc}")
+            import traceback
+            traceback.print_exc()

     # Fallback to Transformers pipeline
     if model_name in PIPELINES:

@@ -237,14 +252,14 @@ def load_pipeline(model_name: str):
         if FLASH_ATTN_AVAILABLE:
             model_kwargs["attn_implementation"] = "flash_attention_2"

+        pipe = pipeline(
+            task="text-generation",
+            model=repo,
+            tokenizer=tokenizer,
+            trust_remote_code=True,
+            device_map="auto",
             model_kwargs=model_kwargs,
+            use_cache=True,
             token=HF_TOKEN,
             torch_dtype=torch.bfloat16,
         )

@@ -257,11 +272,11 @@ def load_pipeline(model_name: str):
         except Exception:
             pass

+        PIPELINES[model_name] = pipe
         _schedule_background_warm(model_name)
+        return pipe
+    except Exception as exc:
+        print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")

     # Fallback to bfloat16/fp16/fp32
     for dtype in (torch.bfloat16, torch.float16, torch.float32):

@@ -642,8 +657,8 @@ def _generate_router_plan_streaming_internal(

     thread = Thread(target=_generate)
     thread.start()
+
+    # Stream tokens
     completion = ""
     parsed_plan: Dict[str, Any] | None = None
     validation_msg = "π Generating..."

@@ -671,8 +686,8 @@ def _generate_router_plan_streaming_internal(

         if finished:
             completion = chunk
+            break
+
     # Final processing after streaming completes
     thread.join()


@@ -772,9 +787,9 @@ def build_ui():
     """) as demo:
         gr.Markdown("# 🛰️ Router Control Room — ZeroGPU" )
         gr.Markdown(description)
+
+        with gr.Row():
+            with gr.Column(scale=3):
                 user_task = gr.Textbox(
                     label="User Task / Problem Statement",
                     placeholder="Describe the homework-style query that needs routing...",

@@ -823,7 +838,7 @@ def build_ui():
         generate_btn = gr.Button("Generate Router Plan", variant="primary")
         clear_btn = gr.Button("Clear", variant="secondary")

+        with gr.Row():
             raw_output = gr.Textbox(label="Raw Model Output", lines=12)
             plan_json = gr.JSON(label="Parsed Router Plan")
             validation_msg = gr.Markdown("Awaiting generation.")
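For context, a model loaded through the vLLM path in the diff above would be queried roughly as below; the repository id and sampling settings are placeholders, not values taken from app.py, which streams tokens incrementally rather than using this batch call.

from vllm import LLM, SamplingParams

# Placeholder repo id; any AWQ-quantized checkpoint supported by vLLM would do.
llm = LLM(model="my-org/router-model-awq", quantization="awq", trust_remote_code=True)
outputs = llm.generate(
    ["Route this homework-style query to the right plan."],
    SamplingParams(temperature=0.2, max_tokens=128),
)
print(outputs[0].outputs[0].text)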