"""Helpers for loading Hugging Face tokenizers and inspecting their output."""

import logging

from transformers import AutoTokenizer, GemmaTokenizer

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format='%(asctime)s - %(levelname)s - %(message)s',
)


def get_tokenizer(id: str):
    """Load and return the tokenizer identified by *id*.

    Identifiers containing ``"gemma"`` (case-insensitive) are loaded with
    ``GemmaTokenizer``; every other identifier goes through ``AutoTokenizer``
    with ``trust_remote_code=True``.

    Args:
        id: Identifier passed straight to ``from_pretrained``. The name
            shadows the ``id`` builtin but is kept so that existing keyword
            callers keep working.

    Returns:
        The tokenizer object returned by ``from_pretrained``.

    Raises:
        Exception: Any error raised while loading is logged and re-raised
            unchanged.
    """
    logging.debug("Loading tokenizer: %s", id)
    try:
        if "gemma" in id.lower():
            tokenizer = GemmaTokenizer.from_pretrained(id)
        else:
            # NOTE(review): trust_remote_code=True permits code shipped with
            # the tokenizer repository to run locally -- only pass trusted
            # identifiers (see the transformers from_pretrained docs).
            tokenizer = AutoTokenizer.from_pretrained(id, trust_remote_code=True)
        logging.debug("Tokenizer loaded: %s", tokenizer)
        return tokenizer
    except Exception as e:
        logging.error("Error loading tokenizer %s: %s", id, e)
        # Bare raise re-raises the active exception with its traceback intact.
        raise


def get_tokenization(tokenizer, text: str) -> list:
    """Return the string tokens that *tokenizer* produces for *text*.

    The text is encoded to ids with ``tokenizer.encode`` and the ids are
    mapped back to token strings with ``tokenizer.convert_ids_to_tokens``.
    Whether special tokens appear in the result depends on the defaults of
    the tokenizer's ``encode`` method.

    Args:
        tokenizer: Object exposing ``encode`` and ``convert_ids_to_tokens``.
        text: Text to tokenize.

    Returns:
        The value returned by ``convert_ids_to_tokens`` for the encoded ids.
    """
    logging.debug("Tokenizing text: %s", text)
    ids = tokenizer.encode(text)
    string_tokens = tokenizer.convert_ids_to_tokens(ids)
    logging.debug("Tokens: %s", string_tokens)
    return string_tokens


def get_vocab_size(tokenizer) -> int:
    """Return the number of entries in ``tokenizer.get_vocab()``.

    Args:
        tokenizer: Object exposing ``get_vocab``.

    Returns:
        ``len()`` of the vocabulary returned by ``get_vocab``.
    """
    logging.debug("Getting vocabulary size for tokenizer: %s", tokenizer)
    vocab_size = len(tokenizer.get_vocab())
    logging.debug("Vocabulary size: %s", vocab_size)
    return vocab_size


def check_latin_support(tokenizer) -> str:
    """Report whether *tokenizer* handles a Latin-script sample sentence.

    A fixed sample containing Latin letters and digits is tokenized; the
    tokenizer is considered to support Latin text when none of the resulting
    tokens equals ``tokenizer.unk_token``.

    Args:
        tokenizer: Object exposing ``tokenize`` and ``unk_token``.

    Returns:
        ``"✅"`` when no unknown token is produced, ``"❌"`` otherwise or
        when the check itself raises (the error is logged, not propagated).
    """
    logging.debug("Checking Latin support for tokenizer: %s", tokenizer)
    try:
        test_text = "This is a test with latin characters 1234567890."
        tokens = tokenizer.tokenize(test_text)
        # If the tokenizer can tokenize the latin text without returning
        # unknown tokens, we consider it as supporting latin.
        supported = tokenizer.unk_token not in tokens
    except Exception as e:
        # Best-effort check: report "unsupported" instead of crashing, but
        # keep the traceback in the log so the failure can be diagnosed.
        logging.exception("Error checking latin support: %s", e)
        return "❌"

    verdict = "✅" if supported else "❌"
    logging.debug("Latin support: %s", verdict)
    return verdict