Space: gary-boon (Sleeping)

Commit 6bf9f5c · 1 Parent(s): f94a7ae
Committed by Claude Opus 4.5
feat: Implement tier-based model filtering by device type
Add min_device field to model configs to specify minimum hardware requirement:
- CodeGen 350M: min_device="cpu" (available to all users)
- Code Llama 7B: min_device="gpu" (GPU users only)
- Devstral Small 24B: min_device="gpu" (GPU users only)
Update /models endpoint to filter models based on backend hardware:
- GPU backends return all 3 models
- CPU backends only return CodeGen 350M
This ensures users only see models their backend can actually serve,
preventing confusion and failed requests.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- backend/model_config.py +6 -1
- backend/model_service.py +49 -40
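
The rule the commit describes reduces to a single predicate: a model is served when the backend has a GPU, or when the model's `min_device` is `"cpu"`. A minimal standalone sketch of that rule (the three-entry `MODELS` table and the `visible_models` helper below are illustrative stand-ins, not code from this repo):

```python
# Illustrative sketch of the tier rule in this commit; MODELS is a toy stand-in
# for the real SUPPORTED_MODELS table in backend/model_config.py.
MODELS = {
    "codegen-350m":       {"min_device": "cpu"},  # available to all backends
    "code-llama-7b":      {"min_device": "gpu"},  # GPU backends only
    "devstral-small-24b": {"min_device": "gpu"},  # GPU backends only
}

def visible_models(backend_device: str) -> list[str]:
    """Model ids a backend of the given type ("cpu" or "gpu") should expose."""
    return [
        model_id
        for model_id, cfg in MODELS.items()
        if backend_device == "gpu" or cfg.get("min_device", "cpu") == "cpu"
    ]

print(visible_models("gpu"))  # ['codegen-350m', 'code-llama-7b', 'devstral-small-24b']
print(visible_models("cpu"))  # ['codegen-350m']
```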
backend/model_config.py CHANGED

@@ -20,6 +20,7 @@ class ModelConfig(TypedDict):
     context_length: int
     attention_type: str  # "multi_head" or "grouped_query"
     requires_gpu: bool
+    min_device: str  # "cpu" or "gpu" - minimum device required to run this model
     min_vram_gb: float
     min_ram_gb: float
     recommended_dtype: str  # "fp16", "bf16", or "fp32"
@@ -43,6 +44,7 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "context_length": 2048,
         "attention_type": "multi_head",
         "requires_gpu": False,
+        "min_device": "cpu",  # Can run on CPU or GPU
         "min_vram_gb": 2.0,
         "min_ram_gb": 4.0,
         "recommended_dtype": "fp16",  # fp16 for GPU, fp32 for CPU
@@ -63,6 +65,7 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "context_length": 16384,
         "attention_type": "grouped_query",
         "requires_gpu": True,  # Strongly recommended for usable performance
+        "min_device": "gpu",  # Requires GPU for usable performance
         "min_vram_gb": 14.0,  # FP16 requires ~14GB VRAM
         "min_ram_gb": 18.0,  # FP16 requires ~18GB RAM for CPU fallback
         "recommended_dtype": "fp16",
@@ -83,6 +86,7 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "context_length": 131072,
         "attention_type": "grouped_query",
         "requires_gpu": True,  # BF16 required, GPU strongly recommended
+        "min_device": "gpu",  # Requires GPU - unusable on CPU
         "min_vram_gb": 48.0,  # BF16 requires ~48GB VRAM
         "min_ram_gb": 96.0,  # BF16 requires ~96GB RAM for CPU fallback
         "recommended_dtype": "bf16",  # Devstral requires bfloat16
@@ -152,6 +156,7 @@ def list_all_models() -> List[Dict[str, any]]:
             "attention_type": config["attention_type"],
             "num_layers": config["num_layers"],
             "num_heads": config["num_heads"],
-            "requires_gpu": config["requires_gpu"]
+            "requires_gpu": config["requires_gpu"],
+            "min_device": config.get("min_device", "cpu")
         })
     return models
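
One detail worth noting in `list_all_models()`: the new field is read with `config.get("min_device", "cpu")`, so any config written before this commit is treated as CPU-capable rather than raising a `KeyError`. A quick illustration of that fallback (both dicts below are hypothetical):

```python
# Hypothetical configs: one predating the min_device field, one including it.
legacy_cfg = {"requires_gpu": False}                   # no "min_device" key
new_cfg = {"requires_gpu": True, "min_device": "gpu"}

# Same lookup the diff uses; a missing key falls back to "cpu".
print(legacy_cfg.get("min_device", "cpu"))  # -> "cpu"
print(new_cfg.get("min_device", "cpu"))     # -> "gpu"
```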
backend/model_service.py CHANGED

@@ -1015,43 +1015,52 @@ async def debug_device():
 
 @app.get("/models")
 async def list_models():
-    """List
-    Returns model metadata including availability based on current hardware.
+    """List available models this backend can serve based on hardware.
+
+    Filters models by min_device requirement:
+    - GPU backends return all models (can run CPU and GPU models)
+    - CPU backends only return models with min_device="cpu"
+
     Used by frontend to populate model selector dynamically.
     """
     from .model_config import SUPPORTED_MODELS
 
     # Check current device capabilities
     has_gpu = manager.device is not None and manager.device.type in ["cuda", "mps"]
+    device_type = "gpu" if has_gpu else "cpu"
+
     available_vram = 0
     if has_gpu and torch.cuda.is_available():
         available_vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # GB
 
     models = []
     for model_id, config in SUPPORTED_MODELS.items():
+        model_min_device = config.get("min_device", "cpu")
+
+        # GPU backends can run all models
+        # CPU backends can only run CPU models
+        if device_type == "gpu" or model_min_device == "cpu":
+            # Check VRAM requirements for GPU models
+            is_available = True
+            if has_gpu and available_vram > 0 and available_vram < config["min_vram_gb"]:
+                is_available = False
+
+            models.append({
+                "id": model_id,
+                "name": config["display_name"],
+                "size": config["size"],
+                "architecture": config["architecture"],
+                "num_layers": config["num_layers"],
+                "num_heads": config["num_heads"],
+                "vocab_size": config["vocab_size"],
+                "context_length": config["context_length"],
+                "attention_type": config["attention_type"],
+                "requires_gpu": config["requires_gpu"],
+                "min_device": model_min_device,
+                "available": is_available
+            })
 
-    return {"models": models}
+    return {"models": models, "device": device_type}
 
 
 @app.get("/models/current")
@@ -1149,34 +1158,34 @@ async def model_info(authenticated: bool = Depends(verify_api_key)):
 
 @app.get("/models")
 async def get_models(authenticated: bool = Depends(verify_api_key)):
-    """Get list of available models filtered by current hardware
+    """Get list of available models filtered by current hardware.
+
+    Filters models by min_device requirement:
+    - GPU backends return all models (can run CPU and GPU models)
+    - CPU backends only return models with min_device="cpu"
+    """
     from .model_config import list_all_models, SUPPORTED_MODELS
 
     # Get current device type
-    if
-        device_type = "cuda"
-    elif torch.backends.mps.is_available():
-        device_type = "mps"
+    has_gpu = torch.cuda.is_available() or torch.backends.mps.is_available()
+    device_type = "gpu" if has_gpu else "cpu"
 
     all_models = list_all_models()
 
-    # Filter models based on
+    # Filter models based on min_device requirement
     available_models = []
     for model in all_models:
         model_config = SUPPORTED_MODELS.get(model['id'])
+        model_min_device = model_config.get("min_device", "cpu") if model_config else "cpu"
 
-        model['available'] = True
-        model['is_current'] = (model['id'] == manager.model_id)
-        available_models.append(model)
+        # GPU backends can run all models
+        # CPU backends can only run CPU models
+        if device_type == "gpu" or model_min_device == "cpu":
+            model['available'] = True
+            model['is_current'] = (model['id'] == manager.model_id)
+            available_models.append(model)
 
-    return {"models": available_models}
+    return {"models": available_models, "device": device_type}
 
 @app.get("/models/current")
 async def get_current_model(authenticated: bool = Depends(verify_api_key)):
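
To see the filtering from the client side, a frontend can call the unauthenticated `/models` endpoint and read the new `device` and `available` fields. A rough sketch, assuming the service is reachable at a placeholder `BASE_URL` and that the `requests` package is installed (neither is specified by this commit):

```python
import requests

BASE_URL = "http://localhost:8000"  # placeholder; not taken from this repo

resp = requests.get(f"{BASE_URL}/models", timeout=10)
resp.raise_for_status()
payload = resp.json()

print("backend device:", payload["device"])  # "gpu" or "cpu"
for model in payload["models"]:
    # A CPU backend only lists min_device == "cpu" models at all; a GPU backend
    # lists everything but may mark a model unavailable when VRAM is too small.
    print(model["id"], model["min_device"], model["available"])
```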