# tokenizers/app/services/tokenizer_service.py
"""
Tokenizer Service - Handles tokenizer loading, caching, and management
"""
import time
from typing import Any, Dict, Optional, Tuple

from flask import current_app
from transformers import AutoTokenizer


class TokenizerService:
"""Service for managing tokenizer loading and caching."""
# Predefined tokenizer models with aliases
TOKENIZER_MODELS = {
        'qwen3': {
            'name': 'Qwen/Qwen3-0.6B',
            'alias': 'Qwen 3'
        },
        'gemma3-27b': {
            'name': 'google/gemma-3-27b-it',
            'alias': 'Gemma 3 27B'
        },
        'glm4': {
            'name': 'THUDM/GLM-4-32B-0414',
            'alias': 'GLM 4'
        },
        'mistral-small': {
            'name': 'mistralai/Mistral-Small-3.1-24B-Instruct-2503',
            'alias': 'Mistral Small 3.1'
        },
        'llama4': {
            'name': 'meta-llama/Llama-4-Scout-17B-16E-Instruct',
            'alias': 'Llama 4'
        },
        'deepseek-r1': {
            'name': 'deepseek-ai/DeepSeek-R1',
            'alias': 'DeepSeek R1'
        },
        'qwen_25_72b': {
            'name': 'Qwen/Qwen2.5-72B-Instruct',
            'alias': 'Qwen 2.5 72B'
        },
        'llama_33': {
            'name': 'unsloth/Llama-3.3-70B-Instruct-bnb-4bit',
            'alias': 'Llama 3.3 70B'
        },
        'gemma2_2b': {
            'name': 'google/gemma-2-2b-it',
            'alias': 'Gemma 2 2B'
        },
        'bert-large-uncased': {
            'name': 'google-bert/bert-large-uncased',
            'alias': 'BERT Large Uncased'
        },
        'gpt2': {
            'name': 'openai-community/gpt2',
            'alias': 'GPT-2'
        }
    }
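
    # To expose another model in the UI, add an entry in the same shape.
    # The entry below is a hypothetical illustration, not a real model ID:
    #
    #     'my-model': {
    #         'name': 'some-org/Some-Model-7B-Instruct',
    #         'alias': 'Some Model 7B'
    #     },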

    def __init__(self):
        """Initialize the tokenizer service with empty caches."""
        # Predefined tokenizers, keyed by model ID; cached indefinitely
        self.tokenizers: Dict[str, Any] = {}
        # Custom tokenizers, keyed by path: (tokenizer, load_timestamp)
        self.custom_tokenizers: Dict[str, Tuple[Any, float]] = {}
        # Extracted metadata per model, computed once and reused
        self.tokenizer_info_cache: Dict[str, Dict] = {}
        # Most recent load error per custom model
        self.custom_model_errors: Dict[str, str] = {}

    def get_tokenizer_info(self, tokenizer) -> Dict:
        """Extract useful information from a tokenizer."""
        info = {}
        try:
            # Get vocabulary size (dictionary size)
            if hasattr(tokenizer, 'vocab_size'):
                info['vocab_size'] = tokenizer.vocab_size
            elif hasattr(tokenizer, 'get_vocab'):
                info['vocab_size'] = len(tokenizer.get_vocab())

            # Get model max length if available (skip huge sentinel values)
            if hasattr(tokenizer, 'model_max_length') and tokenizer.model_max_length < 1000000:
                info['model_max_length'] = tokenizer.model_max_length

            # Record the tokenizer implementation class
            info['tokenizer_type'] = tokenizer.__class__.__name__

            # Collect the special tokens that are actually set
            special_tokens = {}
            for token_name in ['pad_token', 'eos_token', 'bos_token', 'sep_token',
                               'cls_token', 'unk_token', 'mask_token']:
                if hasattr(tokenizer, token_name) and getattr(tokenizer, token_name) is not None:
                    token_value = getattr(tokenizer, token_name)
                    if token_value and str(token_value).strip():
                        special_tokens[token_name] = str(token_value)
            info['special_tokens'] = special_tokens
        except Exception as e:
            info['error'] = f"Error extracting tokenizer info: {str(e)}"
        return info

    def load_tokenizer(self, model_id_or_name: str) -> Tuple[Optional[Any], Dict, Optional[str]]:
        """
        Load a tokenizer, reusing the cache when possible.

        Returns:
            Tuple of (tokenizer, tokenizer_info, error_message).
        """
        error_message = None
        tokenizer_info = {}

        # Reuse cached tokenizer info if we already have it
        if model_id_or_name in self.tokenizer_info_cache:
            tokenizer_info = self.tokenizer_info_cache[model_id_or_name]

        try:
            # Predefined model ID: load once and keep it cached indefinitely
            if model_id_or_name in self.TOKENIZER_MODELS:
                model_name = self.TOKENIZER_MODELS[model_id_or_name]['name']
                if model_id_or_name not in self.tokenizers:
                    self.tokenizers[model_id_or_name] = AutoTokenizer.from_pretrained(model_name)
                tokenizer = self.tokenizers[model_id_or_name]

                # Get tokenizer info if not already cached
                if model_id_or_name not in self.tokenizer_info_cache:
                    tokenizer_info = self.get_tokenizer_info(tokenizer)
                    self.tokenizer_info_cache[model_id_or_name] = tokenizer_info
                return tokenizer, tokenizer_info, None

            # Otherwise it's a custom model path: check the expiring cache
            current_time = time.time()
            cache_expiration = current_app.config.get('CACHE_EXPIRATION', 3600)
            if model_id_or_name in self.custom_tokenizers:
                cached_tokenizer, timestamp = self.custom_tokenizers[model_id_or_name]
                if current_time - timestamp < cache_expiration:
                    # Get tokenizer info if not already cached
                    if model_id_or_name not in self.tokenizer_info_cache:
                        tokenizer_info = self.get_tokenizer_info(cached_tokenizer)
                        self.tokenizer_info_cache[model_id_or_name] = tokenizer_info
                    return cached_tokenizer, tokenizer_info, None

            # Not in cache or expired: load it and store it with a timestamp
            tokenizer = AutoTokenizer.from_pretrained(model_id_or_name)
            self.custom_tokenizers[model_id_or_name] = (tokenizer, current_time)

            # Clear any previous error recorded for this model
            self.custom_model_errors.pop(model_id_or_name, None)

            # Get and cache tokenizer info
            tokenizer_info = self.get_tokenizer_info(tokenizer)
            self.tokenizer_info_cache[model_id_or_name] = tokenizer_info
            return tokenizer, tokenizer_info, None
        except Exception as e:
            error_message = f"Failed to load tokenizer: {str(e)}"
            # Store the error for later inspection
            self.custom_model_errors[model_id_or_name] = error_message
            return None, tokenizer_info, error_message

    def get_model_alias(self, model_id: str) -> str:
        """Get the display alias for a model ID, falling back to the ID itself."""
        if model_id in self.TOKENIZER_MODELS:
            return self.TOKENIZER_MODELS[model_id]['alias']
        return model_id

    def is_predefined_model(self, model_id: str) -> bool:
        """Check whether a model ID is one of the predefined models."""
        return model_id in self.TOKENIZER_MODELS

    def clear_cache(self):
        """Clear all caches."""
        self.tokenizers.clear()
        self.custom_tokenizers.clear()
        self.tokenizer_info_cache.clear()
        self.custom_model_errors.clear()


# Global instance shared across the application
tokenizer_service = TokenizerService()
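

# Minimal usage sketch (illustrative, not part of the service). 'gpt2' is a
# predefined ID, so this path never touches current_app and can run outside
# a Flask app context; it assumes network access to the Hugging Face Hub on
# first load.
if __name__ == "__main__":
    tokenizer, info, error = tokenizer_service.load_tokenizer('gpt2')
    if error:
        print(error)
    else:
        print(f"Loaded {tokenizer_service.get_model_alias('gpt2')} "
              f"(vocab_size={info.get('vocab_size')})")
        print(tokenizer.tokenize("Hello, tokenizer world!"))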