gary-boon / Claude Opus 4.5 committed
Commit 8d85da8 · Parent(s): cb6f39c
fix: Add chat template support for Devstral instruct model
- Add uses_chat_template field to model configs
- Apply tokenizer.apply_chat_template() for instruct models (see the sketch below)
- Fall back to manual [INST] format if needed
- Lower temperature to 0.15 for instruct models (better code output)

Fixes garbage token generation when using Devstral without proper
instruction formatting.
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
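For context, the pattern this commit relies on is a "chat template with manual fallback" flow. The sketch below illustrates it in isolation, assuming a standard Hugging Face tokenizer; `format_for_instruct` is a hypothetical helper name used for illustration, not a function in this repo.

```python
# Minimal sketch of the chat-template flow this commit introduces (illustrative only).
from transformers import AutoTokenizer

def format_for_instruct(tokenizer, prompt: str) -> str:
    """Wrap a raw code prompt in the model's instruction format."""
    messages = [{"role": "user", "content": f"Complete the following code:\n{prompt}"}]
    # Preferred path: let the tokenizer render its own chat template.
    if getattr(tokenizer, "chat_template", None):
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    # Fallback: manual Mistral-style [INST] wrapper.
    return f"[INST] Complete the following code:\n{prompt} [/INST]"

# Example (downloads tokenizer files on first run):
# tok = AutoTokenizer.from_pretrained("mistralai/Devstral-Small-2507")
# print(format_for_instruct(tok, "def fib(n):"))
```

The diff itself guards on `hasattr(tokenizer, 'apply_chat_template')`; the sketch checks `chat_template` instead, which also covers tokenizers that expose the method but ship no template.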
- backend/model_config.py +7 -3
- backend/model_service.py +29 -1
backend/model_config.py
CHANGED
@@ -23,6 +23,7 @@ class ModelConfig(TypedDict):
     min_vram_gb: float
     min_ram_gb: float
     recommended_dtype: str  # "fp16", "bf16", or "fp32"
+    uses_chat_template: bool  # Whether model expects instruction format
 
 
 # Supported models registry
@@ -41,7 +42,8 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "requires_gpu": False,
         "min_vram_gb": 2.0,
         "min_ram_gb": 4.0,
-        "recommended_dtype": "fp16"  # fp16 for GPU, fp32 for CPU
+        "recommended_dtype": "fp16",  # fp16 for GPU, fp32 for CPU
+        "uses_chat_template": False  # Base model, raw completion
     },
     "code-llama-7b": {
         "hf_path": "codellama/CodeLlama-7b-hf",
@@ -57,7 +59,8 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "requires_gpu": True,  # Strongly recommended for usable performance
         "min_vram_gb": 14.0,  # FP16 requires ~14GB VRAM
         "min_ram_gb": 18.0,  # FP16 requires ~18GB RAM for CPU fallback
-        "recommended_dtype": "fp16"
+        "recommended_dtype": "fp16",
+        "uses_chat_template": False  # Base model, raw completion
     },
     "devstral-small": {
         "hf_path": "mistralai/Devstral-Small-2507",
@@ -73,7 +76,8 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "requires_gpu": True,  # BF16 required, GPU strongly recommended
         "min_vram_gb": 48.0,  # BF16 requires ~48GB VRAM
         "min_ram_gb": 96.0,  # BF16 requires ~96GB RAM for CPU fallback
-        "recommended_dtype": "bf16"  # Devstral requires bfloat16
+        "recommended_dtype": "bf16",  # Devstral requires bfloat16
+        "uses_chat_template": True  # Instruction-tuned, requires chat format
     }
 }
 
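model_service.py below imports `get_model_config` from this module, but that helper is not part of the diff. A minimal sketch of what such a lookup presumably looks like follows, under the assumption that it is a plain dictionary lookup against the `SUPPORTED_MODELS` registry defined above; this is an assumption, not code taken from the repo.

```python
# Assumed shape of the get_model_config helper imported by model_service.py below.
# Not shown in this diff; sketched here as a plain registry lookup (assumption).
from typing import Optional

def get_model_config(model_id: str) -> Optional[ModelConfig]:
    """Return the SUPPORTED_MODELS entry for model_id, or None if unknown."""
    return SUPPORTED_MODELS.get(model_id)

# With the new field, callers can branch safely even for unknown models:
config = get_model_config("devstral-small")
uses_chat_template = config.get("uses_chat_template", False) if config else False
```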
backend/model_service.py
CHANGED
@@ -1460,8 +1460,36 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: bool
 
     logger.info(f"Research attention analysis: prompt_len={len(prompt)}, max_tokens={max_tokens}")
 
+    # Check if model uses chat template (instruct models like Devstral)
+    from .model_config import get_model_config
+    model_config = get_model_config(manager.model_id)
+    uses_chat_template = model_config.get("uses_chat_template", False) if model_config else False
+
+    # Format prompt for chat/instruct models
+    if uses_chat_template:
+        if hasattr(manager.tokenizer, 'apply_chat_template'):
+            # Use tokenizer's built-in chat template
+            messages = [{"role": "user", "content": f"Complete the following code:\n{prompt}"}]
+            formatted_prompt = manager.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+            logger.info(f"Applied chat template for {manager.model_id}")
+        else:
+            # Fallback: Manual Mistral-style instruction format
+            formatted_prompt = f"[INST] Complete the following code:\n{prompt} [/INST]"
+            logger.info(f"Applied manual instruction format for {manager.model_id}")
+        # Use lower temperature for instruct models (more deterministic code)
+        if temperature > 0.3:
+            temperature = 0.15
+            logger.info(f"Adjusted temperature to {temperature} for instruct model")
+    else:
+        # Base model - use raw prompt
+        formatted_prompt = prompt
+
     # Tokenize and prepare
-    inputs = manager.tokenizer(prompt, return_tensors="pt").to(manager.device)
+    inputs = manager.tokenizer(formatted_prompt, return_tensors="pt").to(manager.device)
     prompt_length = inputs["input_ids"].shape[1]
     prompt_token_ids = inputs["input_ids"][0].tolist()
     prompt_tokens = [manager.tokenizer.decode([tid], skip_special_tokens=False) for tid in prompt_token_ids]