Spaces:
Sleeping
Sleeping
| """ | |
| Mistral Tokenizer Wrapper | |
| Provides correct tokenization for Devstral using mistral-common library. | |
| The Tekken tokenizer used by Devstral is incompatible with HuggingFace's | |
| standard tokenization approach. This wrapper uses mistral-common to | |
| produce correct token sequences for the model. | |
| """ | |
| import logging | |
| from typing import List, Optional | |
logger = logging.getLogger(__name__)


class MistralTokenizerWrapper:
    """
    Wrapper around mistral-common's MistralTokenizer for Devstral.

    Devstral uses the Tekken tokenizer, which is incompatible with
    HuggingFace's text-based tokenization path. This wrapper goes through
    mistral-common's encode_chat_completion() so the model receives the
    token IDs it actually expects, including control tokens such as
    [INST] / [/INST].
    """

    def __init__(self, model_name: str):
        """
        Load the Mistral tokenizer from the HuggingFace hub.

        Never raises: on any failure (mistral-common missing, download or
        load error) the wrapper is left in an "unavailable" state that
        callers must check via is_available().

        Args:
            model_name: HuggingFace model path
                (e.g., "mistralai/Devstral-Small-2507")
        """
        # Start in the failed state; flip to available only on full success.
        self.tokenizer = None
        self._available = False
        try:
            # Imported lazily so the module loads even without mistral-common.
            from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
            self.tokenizer = MistralTokenizer.from_hf_hub(model_name)
        except ImportError as e:
            logger.warning(f"mistral-common not available: {e}")
        except Exception as e:
            logger.error(f"Failed to load MistralTokenizer: {e}")
        else:
            self._available = True
            logger.info(f"Loaded MistralTokenizer for {model_name}")

    def is_available(self) -> bool:
        """Return True if the tokenizer was loaded successfully."""
        return self._available

    def encode_chat(
        self,
        system_prompt: str,
        user_prompt: str
    ) -> List[int]:
        """
        Encode chat messages to token IDs using mistral-common.

        Produces the correct token sequence for Devstral, including proper
        handling of control tokens like [INST] and [/INST].

        Args:
            system_prompt: System message content; skipped entirely if empty.
            user_prompt: User message content (e.g., "def quicksort(arr):")

        Returns:
            List of token IDs ready for model input

        Raises:
            RuntimeError: if the tokenizer failed to load.
        """
        if not self._available:
            raise RuntimeError("MistralTokenizer not available")

        from mistral_common.protocol.instruct.messages import (
            SystemMessage, UserMessage
        )
        from mistral_common.protocol.instruct.request import ChatCompletionRequest

        # System message (when present) must precede the user message.
        chat = [UserMessage(content=user_prompt)]
        if system_prompt:
            chat.insert(0, SystemMessage(content=system_prompt))

        encoded = self.tokenizer.encode_chat_completion(
            ChatCompletionRequest(messages=chat)
        )
        logger.info(f"Encoded chat: {len(encoded.tokens)} tokens")
        return encoded.tokens

    def decode(self, token_ids: List[int]) -> str:
        """
        Decode token IDs back to text.

        Args:
            token_ids: List of token IDs to decode

        Returns:
            Decoded text string

        Raises:
            RuntimeError: if the tokenizer failed to load.
        """
        if self._available:
            return self.tokenizer.decode(token_ids)
        raise RuntimeError("MistralTokenizer not available")

    def decode_token(self, token_id: int) -> str:
        """
        Decode a single token ID to text.

        Args:
            token_id: Single token ID to decode

        Returns:
            Decoded text for this token

        Raises:
            RuntimeError: if the tokenizer failed to load.
        """
        if self._available:
            # Delegate to the list-based decode path with a one-element list.
            return self.tokenizer.decode([token_id])
        raise RuntimeError("MistralTokenizer not available")
def create_mistral_tokenizer(model_name: str) -> Optional[MistralTokenizerWrapper]:
    """
    Factory function to create a MistralTokenizerWrapper.

    Returns None if mistral-common is not available or loading fails,
    so callers can fall back to another tokenization path.

    Args:
        model_name: HuggingFace model path

    Returns:
        MistralTokenizerWrapper instance or None
    """
    wrapper = MistralTokenizerWrapper(model_name)
    # BUG FIX: is_available is a method, not a property. The previous bare
    # attribute test (`if wrapper.is_available:`) evaluated the bound method
    # object, which is always truthy, so a failed load still returned the
    # unusable wrapper instead of None. Call it.
    if wrapper.is_available():
        return wrapper
    return None