|
from transformers import AutoTokenizer |
|
import logging |
|
from transformers import GemmaTokenizer |
|
|
|
|
|
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s') |
|
|
|
def get_tokenizer(id):
    """Load and return a tokenizer for the given model id.

    Gemma model ids are routed to ``GemmaTokenizer``; everything else goes
    through ``AutoTokenizer`` with ``trust_remote_code=True``.

    NOTE(review): the parameter name shadows the builtin ``id``; kept as-is
    so existing keyword-argument callers (``get_tokenizer(id=...)``) keep
    working.

    Raises:
        Exception: re-raises whatever ``from_pretrained`` raised, after
            logging it.
    """
    logging.debug("Loading tokenizer: %s", id)
    try:
        if "gemma" in id.lower():
            tokenizer = GemmaTokenizer.from_pretrained(id)
        else:
            # trust_remote_code lets tokenizer code shipped with the
            # checkpoint execute -- only appropriate for trusted model ids.
            tokenizer = AutoTokenizer.from_pretrained(id, trust_remote_code=True)
        logging.debug("Tokenizer loaded: %s", tokenizer)
        return tokenizer
    except Exception as e:
        logging.error("Error loading tokenizer %s: %s", id, e)
        # Bare raise preserves the original traceback (``raise e`` resets it
        # to this frame).
        raise
|
|
|
def get_tokenization(tokenizer, text):
    """Encode *text* with *tokenizer* and return the token strings.

    Runs ``encode`` to get token ids, then maps them back to their string
    forms via ``convert_ids_to_tokens``.
    """
    logging.debug(f"Tokenizing text: {text}")
    token_ids = tokenizer.encode(text)
    string_tokens = tokenizer.convert_ids_to_tokens(token_ids)
    logging.debug(f"Tokens: {string_tokens}")
    return string_tokens
|
|
|
def get_vocab_size(tokenizer):
    """Return the number of entries in the tokenizer's vocabulary."""
    logging.debug(f"Getting vocabulary size for tokenizer: {tokenizer}")
    size = len(tokenizer.get_vocab())
    logging.debug(f"Vocabulary size: {size}")
    return size
|
|
|
def check_latin_support(tokenizer):
    """Check whether *tokenizer* handles basic Latin text and digits.

    Tokenizes a fixed English sample sentence; Latin script is considered
    supported when no resulting token equals the tokenizer's unknown-token
    marker.

    Returns:
        str: "✅" if supported, "❌" if any token is unknown or an error
            occurred while tokenizing.
    """
    logging.debug(f"Checking Latin support for tokenizer: {tokenizer}")
    try:
        test_text = "This is a test with latin characters 1234567890."
        tokens = tokenizer.tokenize(test_text)

        # NOTE: if tokenizer.unk_token is None the comparison is always
        # True, so tokenizers without an unk token report as supported.
        if all(token != tokenizer.unk_token for token in tokens):
            logging.debug("Latin support: ✅")
            return "✅"
        else:
            logging.debug("Latin support: ❌")
            return "❌"
    except Exception as e:
        logging.error(f"Error checking latin support: {e}")
        return "❌"
|
|