Alikestocode committed
Commit 83a232d · 1 Parent(s): a79facb

Suppress AutoAWQ deprecation warnings and improve vLLM logging


- Filter AutoAWQ deprecation warnings (vLLM handles AWQ natively via llm-compressor)
- Improve vLLM loading logs with detailed status information
- Add better error handling with traceback for debugging
- Clarify that vLLM is the primary path (AutoAWQ is only used for the Transformers fallback); a sketch of the pattern follows this list
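
For context, here is a minimal sketch of the two patterns this commit applies: a guarded AutoAWQ import that silences the deprecation warning, and a vLLM-first loader that falls back to Transformers and prints a traceback on failure. The names `AWQ_AVAILABLE`, `VLLM_AVAILABLE`, and `load_vllm_model` follow the diff below; the stub bodies and `load_transformers_pipeline` are illustrative placeholders, not the code in app.py.

```python
import traceback
import warnings

# 1. Silence the AutoAWQ deprecation warning: vLLM handles AWQ natively via
#    llm-compressor, so AutoAWQ is only kept for the Transformers fallback.
try:
    from awq import AutoAWQForCausalLM  # deprecated upstream, fallback only
    AWQ_AVAILABLE = True
    warnings.filterwarnings("ignore", category=DeprecationWarning, module="awq")
except ImportError:
    AWQ_AVAILABLE = False
    print("Info: AutoAWQ not available (using vLLM native AWQ support instead)")

VLLM_AVAILABLE = True  # stub: set by the real vLLM import check in app.py


def load_vllm_model(model_name: str):
    """Stub standing in for the vLLM loader shown in the diff."""
    raise RuntimeError("pretend vLLM failed so the fallback path runs")


def load_transformers_pipeline(model_name: str):
    """Stub standing in for the Transformers fallback loader."""
    return f"transformers-pipeline({model_name})"


# 2. Prefer vLLM, fall back to Transformers, and print a full traceback so
#    load failures are easier to debug in the Space logs.
def load_pipeline(model_name: str):
    if VLLM_AVAILABLE:
        try:
            return load_vllm_model(model_name)  # primary path
        except Exception as exc:
            print(f"vLLM load failed, falling back to Transformers: {exc}")
            traceback.print_exc()
    return load_transformers_pipeline(model_name)  # fallback path


if __name__ == "__main__":
    print(load_pipeline("demo-model"))
```

In the real app.py the fallback builds a Transformers `pipeline` (with BitsAndBytes quantization where available), as the diff below shows.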

Files changed (1)
  1. app.py +41 -26
app.py CHANGED
@@ -34,13 +34,16 @@ except ImportError:
     LLM_COMPRESSOR_AVAILABLE = False
     print("Warning: LLM Compressor not available (models should be pre-quantized)")
 
-# Try to import AWQ, fallback to BitsAndBytes if not available
+# Try to import AWQ (deprecated, but kept for fallback compatibility)
+# Note: AutoAWQ is deprecated; vLLM handles AWQ natively via llm-compressor
 try:
     from awq import AutoAWQForCausalLM
     AWQ_AVAILABLE = True
+    import warnings
+    warnings.filterwarnings("ignore", category=DeprecationWarning, module="awq")
 except ImportError:
     AWQ_AVAILABLE = False
-    print("Warning: AutoAWQ not available, falling back to BitsAndBytes")
+    print("Info: AutoAWQ not available (using vLLM native AWQ support instead)")
 
 # Always import BitsAndBytesConfig for fallback
 try:
@@ -134,6 +137,7 @@ def load_vllm_model(model_name: str):
 
     try:
         # vLLM configuration optimized for ZeroGPU H200 slice
+        # vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
         llm_kwargs = {
             "model": repo,
             "trust_remote_code": True,
@@ -147,17 +151,24 @@ def load_vllm_model(model_name: str):
             "enable_prefix_caching": True,  # Cache prompts for faster TTFT
         }
 
-        # Add quantization if specified (vLLM auto-detects AWQ)
+        # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
         if quantization == "awq":
             llm_kwargs["quantization"] = "awq"
-            # vLLM will auto-detect AWQ weights if present
+            # vLLM will auto-detect AWQ weights if present (handled by llm-compressor)
+            print(f" → AWQ quantization enabled (vLLM native support)")
 
+        print(f" → Loading with vLLM (continuous batching, PagedAttention)...")
         llm = LLM(**llm_kwargs)
         VLLM_MODELS[model_name] = llm
-        print(f"✅ vLLM model loaded: {model_name} (continuous batching enabled)")
+        print(f"✅ vLLM model loaded: {model_name}")
+        print(f" - Continuous batching: enabled (max {llm_kwargs['max_num_seqs']} concurrent)")
+        print(f" - Prefix caching: enabled")
+        print(f" - Quantization: {quantization or 'none (bf16)'}")
         return llm
     except Exception as exc:
         print(f"❌ vLLM load failed for {repo}: {exc}")
+        import traceback
+        traceback.print_exc()
         raise
 
 
@@ -202,12 +213,16 @@ def load_awq_pipeline(repo: str, tokenizer):
 
 def load_pipeline(model_name: str):
     """Load model with vLLM (preferred) or Transformers (fallback)."""
-    # Try vLLM first (best performance with AWQ support)
+    # Try vLLM first (best performance with native AWQ support via llm-compressor)
+    # vLLM handles AWQ natively, so AutoAWQ deprecation doesn't affect us
     if VLLM_AVAILABLE:
         try:
+            print(f"Attempting to load {model_name} with vLLM (native AWQ support)...")
             return load_vllm_model(model_name)
         except Exception as exc:
-            print(f"vLLM load failed, falling back to Transformers: {exc}")
+            print(f"⚠️ vLLM load failed, falling back to Transformers: {exc}")
+            import traceback
+            traceback.print_exc()
 
     # Fallback to Transformers pipeline
     if model_name in PIPELINES:
@@ -237,14 +252,14 @@ def load_pipeline(model_name: str):
             if FLASH_ATTN_AVAILABLE:
                 model_kwargs["attn_implementation"] = "flash_attention_2"
 
-            pipe = pipeline(
-                task="text-generation",
-                model=repo,
-                tokenizer=tokenizer,
-                trust_remote_code=True,
-                device_map="auto",
+            pipe = pipeline(
+                task="text-generation",
+                model=repo,
+                tokenizer=tokenizer,
+                trust_remote_code=True,
+                device_map="auto",
                 model_kwargs=model_kwargs,
-                use_cache=True,
+                use_cache=True,
                 token=HF_TOKEN,
                 torch_dtype=torch.bfloat16,
             )
@@ -257,11 +272,11 @@ def load_pipeline(model_name: str):
             except Exception:
                 pass
 
-            PIPELINES[model_name] = pipe
+            PIPELINES[model_name] = pipe
             _schedule_background_warm(model_name)
-            return pipe
-        except Exception as exc:
-            print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")
+            return pipe
+        except Exception as exc:
+            print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")
 
     # Fallback to bfloat16/fp16/fp32
     for dtype in (torch.bfloat16, torch.float16, torch.float32):
@@ -642,8 +657,8 @@ def _generate_router_plan_streaming_internal(
 
     thread = Thread(target=_generate)
     thread.start()
-
-    # Stream tokens
+
+    # Stream tokens
     completion = ""
     parsed_plan: Dict[str, Any] | None = None
     validation_msg = "🔄 Generating..."
@@ -671,8 +686,8 @@
 
         if finished:
             completion = chunk
-            break
-
+            break
+
     # Final processing after streaming completes
     thread.join()
 
@@ -772,9 +787,9 @@ def build_ui():
     """) as demo:
         gr.Markdown("# 🛰️ Router Control Room — ZeroGPU" )
        gr.Markdown(description)
-
-        with gr.Row():
-            with gr.Column(scale=3):
+
+        with gr.Row():
+            with gr.Column(scale=3):
                 user_task = gr.Textbox(
                     label="User Task / Problem Statement",
                     placeholder="Describe the homework-style query that needs routing...",
@@ -823,7 +838,7 @@ def build_ui():
         generate_btn = gr.Button("Generate Router Plan", variant="primary")
         clear_btn = gr.Button("Clear", variant="secondary")
 
-        with gr.Row():
+        with gr.Row():
             raw_output = gr.Textbox(label="Raw Model Output", lines=12)
             plan_json = gr.JSON(label="Parsed Router Plan")
             validation_msg = gr.Markdown("Awaiting generation.")