Space: gary-boon (Sleeping)

Commit 6bf9f5c · 1 Parent(s): f94a7ae
Committed by Claude Opus 4.5
feat: Implement tier-based model filtering by device type
Add min_device field to model configs to specify minimum hardware requirement:
- CodeGen 350M: min_device="cpu" (available to all users)
- Code Llama 7B: min_device="gpu" (GPU users only)
- Devstral Small 24B: min_device="gpu" (GPU users only)
Update /models endpoint to filter models based on backend hardware:
- GPU backends return all 3 models
- CPU backends only return CodeGen 350M
This ensures users only see models their backend can actually serve,
preventing confusion and failed requests.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
- backend/model_config.py +6 -1
- backend/model_service.py +49 -40
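
The rule the commit describes reduces to a single predicate: a model is served when the backend has a GPU, or when the model's `min_device` is `"cpu"`. A minimal standalone sketch of that rule (the three-entry `MODELS` table and the `visible_models` helper below are illustrative stand-ins, not code from this repo):

```python
# Illustrative sketch of the tier rule in this commit; MODELS is a toy stand-in
# for the real SUPPORTED_MODELS table in backend/model_config.py.
MODELS = {
    "codegen-350m":       {"min_device": "cpu"},  # available to all backends
    "code-llama-7b":      {"min_device": "gpu"},  # GPU backends only
    "devstral-small-24b": {"min_device": "gpu"},  # GPU backends only
}

def visible_models(backend_device: str) -> list[str]:
    """Model ids a backend of the given type ("cpu" or "gpu") should expose."""
    return [
        model_id
        for model_id, cfg in MODELS.items()
        if backend_device == "gpu" or cfg.get("min_device", "cpu") == "cpu"
    ]

print(visible_models("gpu"))  # ['codegen-350m', 'code-llama-7b', 'devstral-small-24b']
print(visible_models("cpu"))  # ['codegen-350m']
```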
backend/model_config.py CHANGED

@@ -20,6 +20,7 @@ class ModelConfig(TypedDict):
     context_length: int
     attention_type: str  # "multi_head" or "grouped_query"
     requires_gpu: bool
+    min_device: str  # "cpu" or "gpu" - minimum device required to run this model
     min_vram_gb: float
     min_ram_gb: float
     recommended_dtype: str  # "fp16", "bf16", or "fp32"
@@ -43,6 +44,7 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "context_length": 2048,
         "attention_type": "multi_head",
         "requires_gpu": False,
+        "min_device": "cpu",  # Can run on CPU or GPU
         "min_vram_gb": 2.0,
         "min_ram_gb": 4.0,
         "recommended_dtype": "fp16",  # fp16 for GPU, fp32 for CPU
@@ -63,6 +65,7 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "context_length": 16384,
         "attention_type": "grouped_query",
         "requires_gpu": True,  # Strongly recommended for usable performance
+        "min_device": "gpu",  # Requires GPU for usable performance
         "min_vram_gb": 14.0,  # FP16 requires ~14GB VRAM
         "min_ram_gb": 18.0,  # FP16 requires ~18GB RAM for CPU fallback
         "recommended_dtype": "fp16",
@@ -83,6 +86,7 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "context_length": 131072,
         "attention_type": "grouped_query",
         "requires_gpu": True,  # BF16 required, GPU strongly recommended
+        "min_device": "gpu",  # Requires GPU - unusable on CPU
         "min_vram_gb": 48.0,  # BF16 requires ~48GB VRAM
         "min_ram_gb": 96.0,  # BF16 requires ~96GB RAM for CPU fallback
         "recommended_dtype": "bf16",  # Devstral requires bfloat16
@@ -152,6 +156,7 @@ def list_all_models() -> List[Dict[str, any]]:
             "attention_type": config["attention_type"],
             "num_layers": config["num_layers"],
             "num_heads": config["num_heads"],
-            "requires_gpu": config["requires_gpu"]
+            "requires_gpu": config["requires_gpu"],
+            "min_device": config.get("min_device", "cpu")
         })
     return models
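
One detail worth noting in `list_all_models()`: the new field is read with `config.get("min_device", "cpu")`, so any config written before this commit is treated as CPU-capable rather than raising a `KeyError`. A quick illustration of that fallback (both dicts below are hypothetical):

```python
# Hypothetical configs: one predating the min_device field, one including it.
legacy_cfg = {"requires_gpu": False}                   # no "min_device" key
new_cfg = {"requires_gpu": True, "min_device": "gpu"}

# Same lookup the diff uses; a missing key falls back to "cpu".
print(legacy_cfg.get("min_device", "cpu"))  # -> "cpu"
print(new_cfg.get("min_device", "cpu"))     # -> "gpu"
```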
backend/model_service.py CHANGED

@@ -1015,43 +1015,52 @@ async def debug_device():
 
 @app.get("/models")
 async def list_models():
-    """List
-    Returns model metadata including availability based on current hardware.
+    """List available models this backend can serve based on hardware.
+
+    Filters models by min_device requirement:
+    - GPU backends return all models (can run CPU and GPU models)
+    - CPU backends only return models with min_device="cpu"
+
     Used by frontend to populate model selector dynamically.
     """
     from .model_config import SUPPORTED_MODELS
 
     # Check current device capabilities
     has_gpu = manager.device is not None and manager.device.type in ["cuda", "mps"]
+    device_type = "gpu" if has_gpu else "cpu"
+
     available_vram = 0
     if has_gpu and torch.cuda.is_available():
         available_vram = torch.cuda.get_device_properties(0).total_memory / (1024**3)  # GB
 
     models = []
     for model_id, config in SUPPORTED_MODELS.items():
+        model_min_device = config.get("min_device", "cpu")
+
+        # GPU backends can run all models
+        # CPU backends can only run CPU models
+        if device_type == "gpu" or model_min_device == "cpu":
+            # Check VRAM requirements for GPU models
+            is_available = True
+            if has_gpu and available_vram > 0 and available_vram < config["min_vram_gb"]:
+                is_available = False
+
+            models.append({
+                "id": model_id,
+                "name": config["display_name"],
+                "size": config["size"],
+                "architecture": config["architecture"],
+                "num_layers": config["num_layers"],
+                "num_heads": config["num_heads"],
+                "vocab_size": config["vocab_size"],
+                "context_length": config["context_length"],
+                "attention_type": config["attention_type"],
+                "requires_gpu": config["requires_gpu"],
+                "min_device": model_min_device,
+                "available": is_available
+            })
 
-    return {"models": models}
+    return {"models": models, "device": device_type}
 
 
 @app.get("/models/current")
@@ -1149,34 +1158,34 @@ async def model_info(authenticated: bool = Depends(verify_api_key)):
 
 @app.get("/models")
 async def get_models(authenticated: bool = Depends(verify_api_key)):
-    """Get list of available models filtered by current hardware
+    """Get list of available models filtered by current hardware.
+
+    Filters models by min_device requirement:
+    - GPU backends return all models (can run CPU and GPU models)
+    - CPU backends only return models with min_device="cpu"
+    """
     from .model_config import list_all_models, SUPPORTED_MODELS
 
     # Get current device type
-    if
-        device_type = "cuda"
-    elif torch.backends.mps.is_available():
-        device_type = "mps"
+    has_gpu = torch.cuda.is_available() or torch.backends.mps.is_available()
+    device_type = "gpu" if has_gpu else "cpu"
 
     all_models = list_all_models()
 
-    # Filter models based on
+    # Filter models based on min_device requirement
     available_models = []
     for model in all_models:
         model_config = SUPPORTED_MODELS.get(model['id'])
+        model_min_device = model_config.get("min_device", "cpu") if model_config else "cpu"
 
-        model['available'] = True
-        model['is_current'] = (model['id'] == manager.model_id)
-        available_models.append(model)
+        # GPU backends can run all models
+        # CPU backends can only run CPU models
+        if device_type == "gpu" or model_min_device == "cpu":
+            model['available'] = True
+            model['is_current'] = (model['id'] == manager.model_id)
+            available_models.append(model)
 
-    return {"models": available_models}
+    return {"models": available_models, "device": device_type}
 
 @app.get("/models/current")
 async def get_current_model(authenticated: bool = Depends(verify_api_key)):
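
To see the filtering from the client side, a frontend can call the unauthenticated `/models` endpoint and read the new `device` and `available` fields. A rough sketch, assuming the service is reachable at a placeholder `BASE_URL` and that the `requests` package is installed (neither is specified by this commit):

```python
import requests

BASE_URL = "http://localhost:8000"  # placeholder; not taken from this repo

resp = requests.get(f"{BASE_URL}/models", timeout=10)
resp.raise_for_status()
payload = resp.json()

print("backend device:", payload["device"])  # "gpu" or "cpu"
for model in payload["models"]:
    # A CPU backend only lists min_device == "cpu" models at all; a GPU backend
    # lists everything but may mark a model unavailable when VRAM is too small.
    print(model["id"], model["min_device"], model["available"])
```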