Commit 83a232d
1 Parent(s): a79facb
Suppress AutoAWQ deprecation warnings and improve vLLM logging
- Filter AutoAWQ deprecation warnings (vLLM handles AWQ natively via llm-compressor)
- Improve vLLM loading logs with detailed status information
- Add better error handling with traceback for debugging
- Clarify that vLLM is primary path (AutoAWQ only for Transformers fallback)
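Condensing the points above into a short, illustrative sketch (the helper below is simplified relative to app.py: it takes the repo id directly and omits the ZeroGPU-specific kwargs): AWQ models load through vLLM's native quantization support, failures print a full traceback before re-raising, and AutoAWQ is only probed for the Transformers fallback with its DeprecationWarning filtered at import time.

# Illustrative sketch only; names and signatures are simplified from app.py.
import traceback
import warnings

# AutoAWQ is deprecated upstream and only needed for the Transformers fallback,
# so its DeprecationWarning is silenced when it happens to be installed.
try:
    from awq import AutoAWQForCausalLM
    AWQ_AVAILABLE = True
    warnings.filterwarnings("ignore", category=DeprecationWarning, module="awq")
except ImportError:
    AWQ_AVAILABLE = False

from vllm import LLM

def load_vllm_model(repo: str, quantization: str | None = None) -> LLM:
    """Primary load path: vLLM handles AWQ natively, no AutoAWQ involved."""
    llm_kwargs = {"model": repo, "trust_remote_code": True}
    if quantization == "awq":
        llm_kwargs["quantization"] = "awq"  # vLLM auto-detects AWQ weights
    try:
        return LLM(**llm_kwargs)
    except Exception as exc:
        print(f"vLLM load failed for {repo}: {exc}")
        traceback.print_exc()  # full traceback for debugging, then re-raise
        raise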
app.py CHANGED

@@ -34,13 +34,16 @@ except ImportError:
     LLM_COMPRESSOR_AVAILABLE = False
     print("Warning: LLM Compressor not available (models should be pre-quantized)")

-# Try to import AWQ,
+# Try to import AWQ (deprecated, but kept for fallback compatibility)
+# Note: AutoAWQ is deprecated; vLLM handles AWQ natively via llm-compressor
 try:
     from awq import AutoAWQForCausalLM
     AWQ_AVAILABLE = True
+    import warnings
+    warnings.filterwarnings("ignore", category=DeprecationWarning, module="awq")
 except ImportError:
     AWQ_AVAILABLE = False
-    print("
+    print("Info: AutoAWQ not available (using vLLM native AWQ support instead)")

 # Always import BitsAndBytesConfig for fallback
 try:

@@ -134,6 +137,7 @@ def load_vllm_model(model_name: str):

     try:
         # vLLM configuration optimized for ZeroGPU H200 slice
+        # vLLM natively supports AWQ via llm-compressor (replaces deprecated AutoAWQ)
         llm_kwargs = {
             "model": repo,
             "trust_remote_code": True,

@@ -147,17 +151,24 @@ def load_vllm_model(model_name: str):
             "enable_prefix_caching": True,  # Cache prompts for faster TTFT
         }

-        # Add quantization if specified (vLLM auto-detects AWQ)
+        # Add quantization if specified (vLLM auto-detects AWQ via llm-compressor)
         if quantization == "awq":
             llm_kwargs["quantization"] = "awq"
-            # vLLM will auto-detect AWQ weights if present
+            # vLLM will auto-detect AWQ weights if present (handled by llm-compressor)
+            print(f"  β AWQ quantization enabled (vLLM native support)")

+        print(f"  β Loading with vLLM (continuous batching, PagedAttention)...")
         llm = LLM(**llm_kwargs)
         VLLM_MODELS[model_name] = llm
-        print(f"✅ vLLM model loaded: {model_name}
+        print(f"✅ vLLM model loaded: {model_name}")
+        print(f"  - Continuous batching: enabled (max {llm_kwargs['max_num_seqs']} concurrent)")
+        print(f"  - Prefix caching: enabled")
+        print(f"  - Quantization: {quantization or 'none (bf16)'}")
         return llm
     except Exception as exc:
         print(f"❌ vLLM load failed for {repo}: {exc}")
+        import traceback
+        traceback.print_exc()
         raise


@@ -202,12 +213,16 @@ def load_awq_pipeline(repo: str, tokenizer):

 def load_pipeline(model_name: str):
     """Load model with vLLM (preferred) or Transformers (fallback)."""
-    # Try vLLM first (best performance with AWQ support)
+    # Try vLLM first (best performance with native AWQ support via llm-compressor)
+    # vLLM handles AWQ natively, so AutoAWQ deprecation doesn't affect us
     if VLLM_AVAILABLE:
         try:
+            print(f"Attempting to load {model_name} with vLLM (native AWQ support)...")
             return load_vllm_model(model_name)
         except Exception as exc:
-            print(f"vLLM load failed, falling back to Transformers: {exc}")
+            print(f"⚠️ vLLM load failed, falling back to Transformers: {exc}")
+            import traceback
+            traceback.print_exc()

     # Fallback to Transformers pipeline
     if model_name in PIPELINES:

@@ -237,14 +252,14 @@ def load_pipeline(model_name: str):
         if FLASH_ATTN_AVAILABLE:
             model_kwargs["attn_implementation"] = "flash_attention_2"

+        pipe = pipeline(
+            task="text-generation",
+            model=repo,
+            tokenizer=tokenizer,
+            trust_remote_code=True,
+            device_map="auto",
             model_kwargs=model_kwargs,
+            use_cache=True,
             token=HF_TOKEN,
             torch_dtype=torch.bfloat16,
         )

@@ -257,11 +272,11 @@ def load_pipeline(model_name: str):
         except Exception:
             pass

+        PIPELINES[model_name] = pipe
         _schedule_background_warm(model_name)
+        return pipe
+    except Exception as exc:
+        print(f"8-bit load failed for {repo}: {exc}. Falling back to higher precision.")

     # Fallback to bfloat16/fp16/fp32
     for dtype in (torch.bfloat16, torch.float16, torch.float32):

@@ -642,8 +657,8 @@ def _generate_router_plan_streaming_internal(

     thread = Thread(target=_generate)
     thread.start()
+
+    # Stream tokens
     completion = ""
     parsed_plan: Dict[str, Any] | None = None
     validation_msg = "π Generating..."

@@ -671,8 +686,8 @@ def _generate_router_plan_streaming_internal(

         if finished:
             completion = chunk
+            break
+
     # Final processing after streaming completes
     thread.join()


@@ -772,9 +787,9 @@ def build_ui():
     """) as demo:
         gr.Markdown("# 🛰️ Router Control Room — ZeroGPU" )
         gr.Markdown(description)
+
+        with gr.Row():
+            with gr.Column(scale=3):
                 user_task = gr.Textbox(
                     label="User Task / Problem Statement",
                     placeholder="Describe the homework-style query that needs routing...",

@@ -823,7 +838,7 @@ def build_ui():
         generate_btn = gr.Button("Generate Router Plan", variant="primary")
         clear_btn = gr.Button("Clear", variant="secondary")

+        with gr.Row():
             raw_output = gr.Textbox(label="Raw Model Output", lines=12)
             plan_json = gr.JSON(label="Parsed Router Plan")
             validation_msg = gr.Markdown("Awaiting generation.")
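For context, a model loaded through the vLLM path in the diff above would be queried roughly as below; the repository id and sampling settings are placeholders, not values taken from app.py, which streams tokens incrementally rather than using this batch call.

from vllm import LLM, SamplingParams

# Placeholder repo id; any AWQ-quantized checkpoint supported by vLLM would do.
llm = LLM(model="my-org/router-model-awq", quantization="awq", trust_remote_code=True)
outputs = llm.generate(
    ["Route this homework-style query to the right plan."],
    SamplingParams(temperature=0.2, max_tokens=128),
)
print(outputs[0].outputs[0].text)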