gary-boon and Claude Opus 4.5 committed on
Commit 8d85da8 · 1 Parent(s): cb6f39c

fix: Add chat template support for Devstral instruct model

- Add uses_chat_template field to model configs
- Apply tokenizer.apply_chat_template() for instruct models
- Fall back to manual [INST] format if needed
- Lower temperature to 0.15 for instruct models (better code output)

Fixes garbage token generation when using Devstral without proper
instruction formatting.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
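
For context, the sketch below is not part of the diff; it assumes the stock Hugging Face `AutoTokenizer` for the Devstral checkpoint listed in the config, and shows roughly what the chat-template path produces compared with the raw prompt the model previously received:

```python
from transformers import AutoTokenizer

# Illustrative only: the model id is taken from SUPPORTED_MODELS below; the
# exact rendered string depends on the chat template bundled with the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Devstral-Small-2507")

prompt = "def add(a, b):"
messages = [{"role": "user", "content": f"Complete the following code:\n{prompt}"}]

formatted = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return a string rather than token ids
    add_generation_prompt=True,  # append the assistant-turn marker
)
# Mistral-family templates typically yield something like:
#   <s>[INST] Complete the following code:
#   def add(a, b): [/INST]
# Sending the bare prompt without this wrapper is what produced the
# garbage tokens this commit fixes.
print(formatted)
```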

backend/model_config.py CHANGED
@@ -23,6 +23,7 @@ class ModelConfig(TypedDict):
     min_vram_gb: float
     min_ram_gb: float
     recommended_dtype: str  # "fp16", "bf16", or "fp32"
+    uses_chat_template: bool  # Whether model expects instruction format
 
 
 # Supported models registry
@@ -41,7 +42,8 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "requires_gpu": False,
         "min_vram_gb": 2.0,
         "min_ram_gb": 4.0,
-        "recommended_dtype": "fp16"  # fp16 for GPU, fp32 for CPU
+        "recommended_dtype": "fp16",  # fp16 for GPU, fp32 for CPU
+        "uses_chat_template": False  # Base model, raw completion
     },
     "code-llama-7b": {
         "hf_path": "codellama/CodeLlama-7b-hf",
@@ -57,7 +59,8 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "requires_gpu": True,  # Strongly recommended for usable performance
         "min_vram_gb": 14.0,  # FP16 requires ~14GB VRAM
         "min_ram_gb": 18.0,  # FP16 requires ~18GB RAM for CPU fallback
-        "recommended_dtype": "fp16"
+        "recommended_dtype": "fp16",
+        "uses_chat_template": False  # Base model, raw completion
     },
     "devstral-small": {
         "hf_path": "mistralai/Devstral-Small-2507",
@@ -73,7 +76,8 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "requires_gpu": True,  # BF16 required, GPU strongly recommended
         "min_vram_gb": 48.0,  # BF16 requires ~48GB VRAM
         "min_ram_gb": 96.0,  # BF16 requires ~96GB RAM for CPU fallback
-        "recommended_dtype": "bf16"  # Devstral requires bfloat16
+        "recommended_dtype": "bf16",  # Devstral requires bfloat16
+        "uses_chat_template": True  # Instruction-tuned, requires chat format
     }
 }
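
For readers wiring the new field up elsewhere, a minimal consumption sketch follows. `get_model_config` is the helper imported in model_service.py below, but its body is not shown in this diff, so the lookup here is an assumption:

```python
from typing import Optional

from backend.model_config import SUPPORTED_MODELS, ModelConfig

def get_model_config(model_id: str) -> Optional[ModelConfig]:
    """Assumed shape of the helper: return the registry entry or None."""
    return SUPPORTED_MODELS.get(model_id)

config = get_model_config("devstral-small")
# .get() with a default keeps callers working even if an entry (or an
# externally supplied config) predates the uses_chat_template field.
uses_chat_template = config.get("uses_chat_template", False) if config else False
```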
 
backend/model_service.py CHANGED
@@ -1460,8 +1460,36 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
 
     logger.info(f"Research attention analysis: prompt_len={len(prompt)}, max_tokens={max_tokens}")
 
+    # Check if model uses chat template (instruct models like Devstral)
+    from .model_config import get_model_config
+    model_config = get_model_config(manager.model_id)
+    uses_chat_template = model_config.get("uses_chat_template", False) if model_config else False
+
+    # Format prompt for chat/instruct models
+    if uses_chat_template:
+        if hasattr(manager.tokenizer, 'apply_chat_template'):
+            # Use tokenizer's built-in chat template
+            messages = [{"role": "user", "content": f"Complete the following code:\n{prompt}"}]
+            formatted_prompt = manager.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+            logger.info(f"Applied chat template for {manager.model_id}")
+        else:
+            # Fallback: manual Mistral-style instruction format
+            formatted_prompt = f"[INST] Complete the following code:\n{prompt} [/INST]"
+            logger.info(f"Applied manual instruction format for {manager.model_id}")
+        # Use lower temperature for instruct models (more deterministic code)
+        if temperature > 0.3:
+            temperature = 0.15
+            logger.info(f"Adjusted temperature to {temperature} for instruct model")
+    else:
+        # Base model - use raw prompt
+        formatted_prompt = prompt
+
     # Tokenize and prepare
-    inputs = manager.tokenizer(prompt, return_tensors="pt").to(manager.device)
+    inputs = manager.tokenizer(formatted_prompt, return_tensors="pt").to(manager.device)
     prompt_length = inputs["input_ids"].shape[1]
     prompt_token_ids = inputs["input_ids"][0].tolist()
     prompt_tokens = [manager.tokenizer.decode([tid], skip_special_tokens=False) for tid in prompt_token_ids]
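
As a quick sanity check of the branching added above, here is a hypothetical standalone restatement; the function name and stub tokenizer are invented for the example and are not part of the commit:

```python
def format_prompt(prompt: str, uses_chat_template: bool, tokenizer) -> str:
    """Mirror of the prompt-formatting logic added to analyze_research_attention."""
    if not uses_chat_template:
        return prompt  # base model: raw completion prompt
    if hasattr(tokenizer, "apply_chat_template"):
        messages = [{"role": "user", "content": f"Complete the following code:\n{prompt}"}]
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    # Fallback: manual Mistral-style instruction format
    return f"[INST] Complete the following code:\n{prompt} [/INST]"


class _StubTokenizer:
    """Deliberately lacks apply_chat_template, to exercise the fallback path."""


assert format_prompt("x = 1", False, _StubTokenizer()) == "x = 1"
assert format_prompt("x = 1", True, _StubTokenizer()).startswith("[INST] ")
```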