nouamanetazi's picture
nouamanetazi HF staff
Upload utils.py with huggingface_hub
f43ae3f verified
raw
history blame
1.85 kB
from transformers import AutoTokenizer
import logging
from transformers import GemmaTokenizer # Import GemmaTokenizer
# Configure logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
def get_tokenizer(id):
    """Load and return a tokenizer for the given model id.

    Gemma models get the dedicated ``GemmaTokenizer``; everything else goes
    through ``AutoTokenizer`` with ``trust_remote_code=True`` so hub-hosted
    custom tokenizer classes can load.

    Args:
        id: HuggingFace Hub model id (e.g. "google/gemma-2b").
            NOTE: shadows the ``id`` builtin; kept for backward compatibility.

    Returns:
        The loaded tokenizer instance.

    Raises:
        Exception: re-raises whatever ``from_pretrained`` raised (network,
        missing repo, missing dependency such as sentencepiece, ...).
    """
    logging.debug(f"Loading tokenizer: {id}")
    try:
        if "gemma" in id.lower():
            tokenizer = GemmaTokenizer.from_pretrained(id)
        else:
            tokenizer = AutoTokenizer.from_pretrained(id, trust_remote_code=True)
        logging.debug(f"Tokenizer loaded: {tokenizer}")
        return tokenizer
    except Exception:
        # logging.exception records the full traceback, unlike logging.error
        logging.exception(f"Error loading tokenizer {id}")
        # Bare `raise` preserves the original traceback without adding a
        # redundant frame (idiomatic re-raise vs. `raise e`).
        raise
def get_tokenization(tokenizer, text):
    """Tokenize *text* and return the tokens as strings.

    Encodes the text to ids with the tokenizer, then maps those ids back to
    their string token forms.

    Args:
        tokenizer: a tokenizer exposing ``encode`` and ``convert_ids_to_tokens``.
        text: the input string to tokenize.

    Returns:
        List of string tokens corresponding to the encoded ids.
    """
    logging.debug(f"Tokenizing text: {text}")
    token_ids = tokenizer.encode(text)
    tokens_as_strings = tokenizer.convert_ids_to_tokens(token_ids)
    logging.debug(f"Tokens: {tokens_as_strings}")
    return tokens_as_strings
def get_vocab_size(tokenizer):
    """Return the number of entries in the tokenizer's vocabulary.

    Args:
        tokenizer: a tokenizer exposing ``get_vocab()`` (token -> id mapping).

    Returns:
        int: the vocabulary size (length of the vocab mapping).
    """
    logging.debug(f"Getting vocabulary size for tokenizer: {tokenizer}")
    size = len(tokenizer.get_vocab())
    logging.debug(f"Vocabulary size: {size}")
    return size
def check_latin_support(tokenizer):
    """Heuristically check whether a tokenizer supports Latin script.

    Tokenizes a fixed Latin/digit sample sentence; if no resulting token is
    the tokenizer's unknown token, Latin is considered supported.

    Args:
        tokenizer: a tokenizer exposing ``tokenize`` and ``unk_token``.

    Returns:
        str: "βœ…" when supported, "❌" when any unknown token appears or
        tokenization fails for any reason.
    """
    logging.debug(f"Checking Latin support for tokenizer: {tokenizer}")
    sample = "This is a test with latin characters 1234567890."
    try:
        produced = tokenizer.tokenize(sample)
    except Exception as e:
        logging.error(f"Error checking latin support: {e}")
        return "❌"
    # Any occurrence of the unknown token means the script is unsupported.
    unknown_seen = any(tok == tokenizer.unk_token for tok in produced)
    if unknown_seen:
        logging.debug(f"Latin support: ❌")
        return "❌"
    logging.debug(f"Latin support: βœ…")
    return "βœ…"