gary-boon and Claude Opus 4.5 committed on
Commit 8d85da8 · 1 Parent(s): cb6f39c

fix: Add chat template support for Devstral instruct model

- Add uses_chat_template field to model configs
- Apply tokenizer.apply_chat_template() for instruct models
- Fall back to manual [INST] format if needed
- Lower temperature to 0.15 for instruct models (better code output)

Fixes garbage token generation when using Devstral without proper
instruction formatting.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
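
For context, the sketch below is not part of the diff; it assumes the stock Hugging Face `AutoTokenizer` for the Devstral checkpoint listed in the config, and shows roughly what the chat-template path produces compared with the raw prompt the model previously received:

```python
from transformers import AutoTokenizer

# Illustrative only: the model id is taken from SUPPORTED_MODELS below; the
# exact rendered string depends on the chat template bundled with the tokenizer.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Devstral-Small-2507")

prompt = "def add(a, b):"
messages = [{"role": "user", "content": f"Complete the following code:\n{prompt}"}]

formatted = tokenizer.apply_chat_template(
    messages,
    tokenize=False,              # return a string rather than token ids
    add_generation_prompt=True,  # append the assistant-turn marker
)
# Mistral-family templates typically yield something like:
#   <s>[INST] Complete the following code:
#   def add(a, b): [/INST]
# Sending the bare prompt without this wrapper is what produced the
# garbage tokens this commit fixes.
print(formatted)
```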

backend/model_config.py CHANGED
@@ -23,6 +23,7 @@ class ModelConfig(TypedDict):
     min_vram_gb: float
     min_ram_gb: float
     recommended_dtype: str  # "fp16", "bf16", or "fp32"
+    uses_chat_template: bool  # Whether model expects instruction format
 
 
 # Supported models registry
@@ -41,7 +42,8 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "requires_gpu": False,
         "min_vram_gb": 2.0,
         "min_ram_gb": 4.0,
-        "recommended_dtype": "fp16"  # fp16 for GPU, fp32 for CPU
+        "recommended_dtype": "fp16",  # fp16 for GPU, fp32 for CPU
+        "uses_chat_template": False  # Base model, raw completion
     },
     "code-llama-7b": {
         "hf_path": "codellama/CodeLlama-7b-hf",
@@ -57,7 +59,8 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "requires_gpu": True,  # Strongly recommended for usable performance
         "min_vram_gb": 14.0,  # FP16 requires ~14GB VRAM
         "min_ram_gb": 18.0,  # FP16 requires ~18GB RAM for CPU fallback
-        "recommended_dtype": "fp16"
+        "recommended_dtype": "fp16",
+        "uses_chat_template": False  # Base model, raw completion
     },
     "devstral-small": {
         "hf_path": "mistralai/Devstral-Small-2507",
@@ -73,7 +76,8 @@ SUPPORTED_MODELS: Dict[str, ModelConfig] = {
         "requires_gpu": True,  # BF16 required, GPU strongly recommended
         "min_vram_gb": 48.0,  # BF16 requires ~48GB VRAM
         "min_ram_gb": 96.0,  # BF16 requires ~96GB RAM for CPU fallback
-        "recommended_dtype": "bf16"  # Devstral requires bfloat16
+        "recommended_dtype": "bf16",  # Devstral requires bfloat16
+        "uses_chat_template": True  # Instruction-tuned, requires chat format
     }
 }
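
For readers wiring the new field up elsewhere, a minimal consumption sketch follows. `get_model_config` is the helper imported in model_service.py below, but its body is not shown in this diff, so the lookup here is an assumption:

```python
from typing import Optional

from backend.model_config import SUPPORTED_MODELS, ModelConfig

def get_model_config(model_id: str) -> Optional[ModelConfig]:
    """Assumed shape of the helper: return the registry entry or None."""
    return SUPPORTED_MODELS.get(model_id)

config = get_model_config("devstral-small")
# .get() with a default keeps callers working even if an entry (or an
# externally supplied config) predates the uses_chat_template field.
uses_chat_template = config.get("uses_chat_template", False) if config else False
```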
 
backend/model_service.py CHANGED
@@ -1460,8 +1460,36 @@ async def analyze_research_attention(request: Dict[str, Any], authenticated: boo
 
     logger.info(f"Research attention analysis: prompt_len={len(prompt)}, max_tokens={max_tokens}")
 
+    # Check if model uses chat template (instruct models like Devstral)
+    from .model_config import get_model_config
+    model_config = get_model_config(manager.model_id)
+    uses_chat_template = model_config.get("uses_chat_template", False) if model_config else False
+
+    # Format prompt for chat/instruct models
+    if uses_chat_template:
+        if hasattr(manager.tokenizer, 'apply_chat_template'):
+            # Use tokenizer's built-in chat template
+            messages = [{"role": "user", "content": f"Complete the following code:\n{prompt}"}]
+            formatted_prompt = manager.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True
+            )
+            logger.info(f"Applied chat template for {manager.model_id}")
+        else:
+            # Fallback: manual Mistral-style instruction format
+            formatted_prompt = f"[INST] Complete the following code:\n{prompt} [/INST]"
+            logger.info(f"Applied manual instruction format for {manager.model_id}")
+        # Use lower temperature for instruct models (more deterministic code)
+        if temperature > 0.3:
+            temperature = 0.15
+            logger.info(f"Adjusted temperature to {temperature} for instruct model")
+    else:
+        # Base model - use raw prompt
+        formatted_prompt = prompt
+
     # Tokenize and prepare
-    inputs = manager.tokenizer(prompt, return_tensors="pt").to(manager.device)
+    inputs = manager.tokenizer(formatted_prompt, return_tensors="pt").to(manager.device)
     prompt_length = inputs["input_ids"].shape[1]
     prompt_token_ids = inputs["input_ids"][0].tolist()
     prompt_tokens = [manager.tokenizer.decode([tid], skip_special_tokens=False) for tid in prompt_token_ids]
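
As a quick sanity check of the branching added above, here is a hypothetical standalone restatement; the function name and stub tokenizer are invented for the example and are not part of the commit:

```python
def format_prompt(prompt: str, uses_chat_template: bool, tokenizer) -> str:
    """Mirror of the prompt-formatting logic added to analyze_research_attention."""
    if not uses_chat_template:
        return prompt  # base model: raw completion prompt
    if hasattr(tokenizer, "apply_chat_template"):
        messages = [{"role": "user", "content": f"Complete the following code:\n{prompt}"}]
        return tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
    # Fallback: manual Mistral-style instruction format
    return f"[INST] Complete the following code:\n{prompt} [/INST]"


class _StubTokenizer:
    """Deliberately lacks apply_chat_template, to exercise the fallback path."""


assert format_prompt("x = 1", False, _StubTokenizer()) == "x = 1"
assert format_prompt("x = 1", True, _StubTokenizer()).startswith("[INST] ")
```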