gary-boon Claude Opus 4.5 committed on
Commit 6bf9f5c · 1 Parent(s): f94a7ae

feat: Implement tier-based model filtering by device type


Add a min_device field to model configs to specify the minimum hardware requirement:
- CodeGen 350M: min_device="cpu" (available to all users)
- Code Llama 7B: min_device="gpu" (GPU users only)
- Devstral Small 24B: min_device="gpu" (GPU users only)

Update the /models endpoint to filter models based on backend hardware:
- GPU backends return all 3 models
- CPU backends only return CodeGen 350M

This ensures users only see models their backend can actually serve,
preventing confusion and failed requests.
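
The tier rule in this change reduces to a single predicate per model: a GPU backend serves everything, a CPU backend serves only models whose min_device is "cpu". A minimal sketch of that rule (model names taken from the configs in this commit; the real endpoints additionally attach metadata and VRAM checks):

```python
# Minimal sketch of the tier rule introduced by this commit.
def is_servable(model_min_device: str, backend_device: str) -> bool:
    # GPU backends can run all models; CPU backends only CPU-tier models.
    return backend_device == "gpu" or model_min_device == "cpu"

assert is_servable("cpu", "cpu")      # CodeGen 350M stays visible on CPU
assert not is_servable("gpu", "cpu")  # Code Llama 7B / Devstral hidden on CPU
assert is_servable("gpu", "gpu")      # GPU backends see all three models
```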

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

Files changed (2)
  1. backend/model_config.py +6 -1
  2. backend/model_service.py +49 -40
backend/model_config.py CHANGED
@@ -20,6 +20,7 @@ class ModelConfig(TypedDict):
     context_length: int
     attention_type: str  # "multi_head" or "grouped_query"
     requires_gpu: bool
+    min_device: str  # "cpu" or "gpu" - minimum device required to run this model
     min_vram_gb: float
     min_ram_gb: float
     recommended_dtype: str  # "fp16", "bf16", or "fp32"
@@ -43,6 +44,7 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "context_length": 2048,
         "attention_type": "multi_head",
         "requires_gpu": False,
+        "min_device": "cpu",  # Can run on CPU or GPU
         "min_vram_gb": 2.0,
         "min_ram_gb": 4.0,
         "recommended_dtype": "fp16",  # fp16 for GPU, fp32 for CPU
@@ -63,6 +65,7 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "context_length": 16384,
         "attention_type": "grouped_query",
         "requires_gpu": True,  # Strongly recommended for usable performance
+        "min_device": "gpu",  # Requires GPU for usable performance
         "min_vram_gb": 14.0,  # FP16 requires ~14GB VRAM
         "min_ram_gb": 18.0,  # FP16 requires ~18GB RAM for CPU fallback
         "recommended_dtype": "fp16",
@@ -83,6 +86,7 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "context_length": 131072,
         "attention_type": "grouped_query",
         "requires_gpu": True,  # BF16 required, GPU strongly recommended
+        "min_device": "gpu",  # Requires GPU - unusable on CPU
         "min_vram_gb": 48.0,  # BF16 requires ~48GB VRAM
         "min_ram_gb": 96.0,  # BF16 requires ~96GB RAM for CPU fallback
         "recommended_dtype": "bf16",  # Devstral requires bfloat16
@@ -152,6 +156,7 @@ def list_all_models() -> List[Dict[str, any]]:
             "attention_type": config["attention_type"],
             "num_layers": config["num_layers"],
             "num_heads": config["num_heads"],
-            "requires_gpu": config["requires_gpu"]
+            "requires_gpu": config["requires_gpu"],
+            "min_device": config.get("min_device", "cpu")
         })
     return models
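
Note the `config.get("min_device", "cpu")` fallback in `list_all_models`: any config written before this field existed is treated as CPU-capable and stays visible on every backend. A small illustrative sketch with hypothetical, trimmed-down entries (not the real SUPPORTED_MODELS):

```python
# Hypothetical minimal configs to show the fallback behaviour only.
configs = {
    "codegen-350m": {"min_device": "cpu"},   # explicit CPU tier
    "legacy-model": {},                      # predates min_device entirely
}

for model_id, cfg in configs.items():
    print(model_id, cfg.get("min_device", "cpu"))
# codegen-350m cpu
# legacy-model cpu   <- missing key defaults to the least restrictive tier
```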
backend/model_service.py CHANGED
@@ -1015,43 +1015,52 @@ async def debug_device():
 
 @app.get("/models")
 async def list_models():
-    """List all available models this backend can serve.
+    """List available models this backend can serve based on hardware.
+
+    Filters models by min_device requirement:
+    - GPU backends return all models (can run CPU and GPU models)
+    - CPU backends only return models with min_device="cpu"
 
-    Returns model metadata including availability based on current hardware.
     Used by frontend to populate model selector dynamically.
     """
     from .model_config import SUPPORTED_MODELS
 
     # Check current device capabilities
     has_gpu = manager.device is not None and manager.device.type in ["cuda", "mps"]
+    device_type = "gpu" if has_gpu else "cpu"
+
     available_vram = 0
     if has_gpu and torch.cuda.is_available():
         available_vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # GB
 
     models = []
     for model_id, config in SUPPORTED_MODELS.items():
-        # Determine if model is available on current hardware
-        is_available = True
-        if config["requires_gpu"] and not has_gpu:
-            is_available = False
-        elif has_gpu and available_vram < config["min_vram_gb"]:
-            is_available = False
-
-        models.append({
-            "id": model_id,
-            "name": config["display_name"],
-            "size": config["size"],
-            "architecture": config["architecture"],
-            "num_layers": config["num_layers"],
-            "num_heads": config["num_heads"],
-            "vocab_size": config["vocab_size"],
-            "context_length": config["context_length"],
-            "attention_type": config["attention_type"],
-            "requires_gpu": config["requires_gpu"],
-            "available": is_available
-        })
+        model_min_device = config.get("min_device", "cpu")
+
+        # GPU backends can run all models
+        # CPU backends can only run CPU models
+        if device_type == "gpu" or model_min_device == "cpu":
+            # Check VRAM requirements for GPU models
+            is_available = True
+            if has_gpu and available_vram > 0 and available_vram < config["min_vram_gb"]:
+                is_available = False
+
+            models.append({
+                "id": model_id,
+                "name": config["display_name"],
+                "size": config["size"],
+                "architecture": config["architecture"],
+                "num_layers": config["num_layers"],
+                "num_heads": config["num_heads"],
+                "vocab_size": config["vocab_size"],
+                "context_length": config["context_length"],
+                "attention_type": config["attention_type"],
+                "requires_gpu": config["requires_gpu"],
+                "min_device": model_min_device,
+                "available": is_available
+            })
 
-    return {"models": models}
+    return {"models": models, "device": device_type}
 
 
 @app.get("/models/current")
@@ -1149,34 +1158,34 @@ async def model_info(authenticated: bool = Depends(verify_api_key)):
 
 @app.get("/models")
 async def get_models(authenticated: bool = Depends(verify_api_key)):
-    """Get list of available models filtered by current hardware"""
+    """Get list of available models filtered by current hardware.
+
+    Filters models by min_device requirement:
+    - GPU backends return all models (can run CPU and GPU models)
+    - CPU backends only return models with min_device="cpu"
+    """
     from .model_config import list_all_models, SUPPORTED_MODELS
 
     # Get current device type
-    device_type = "cpu"
-    if torch.cuda.is_available():
-        device_type = "cuda"
-    elif torch.backends.mps.is_available():
-        device_type = "mps"
+    has_gpu = torch.cuda.is_available() or torch.backends.mps.is_available()
+    device_type = "gpu" if has_gpu else "cpu"
 
     all_models = list_all_models()
 
-    # Filter models based on hardware capabilities
+    # Filter models based on min_device requirement
    available_models = []
     for model in all_models:
         model_config = SUPPORTED_MODELS.get(model['id'])
+        model_min_device = model_config.get("min_device", "cpu") if model_config else "cpu"
 
-        # Check if model requires GPU but we're on CPU
-        if model_config and model_config['requires_gpu'] and device_type == "cpu":
-            # Skip GPU-only models when on CPU
-            continue
-
-        # Model is available on this hardware
-        model['available'] = True
-        model['is_current'] = (model['id'] == manager.model_id)
-        available_models.append(model)
+        # GPU backends can run all models
+        # CPU backends can only run CPU models
+        if device_type == "gpu" or model_min_device == "cpu":
+            model['available'] = True
+            model['is_current'] = (model['id'] == manager.model_id)
+            available_models.append(model)
 
-    return {"models": available_models}
+    return {"models": available_models, "device": device_type}
 
 @app.get("/models/current")
 async def get_current_model(authenticated: bool = Depends(verify_api_key)):
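
For completeness, a sketch of how a client might read the updated response. The base URL and the X-API-Key header name are illustrative assumptions; the diff only shows that get_models depends on verify_api_key, not how the credential is transported:

```python
import requests

# Hypothetical host and credential, for illustration only.
resp = requests.get(
    "http://localhost:8000/models",
    headers={"X-API-Key": "dev-key"},
    timeout=10,
)
resp.raise_for_status()
payload = resp.json()

# "device" is "gpu" or "cpu"; the models list is already filtered by the backend.
print("backend device:", payload["device"])
for model in payload["models"]:
    print(model["id"], model.get("min_device"), model.get("available"))
```

On a GPU backend the loop prints all three models; on a CPU backend only the CodeGen 350M entry remains, matching the behaviour described in the commit message.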