# mini-sun-init-bert-tf-110m / tokenizer_make.py
import os

from transformers import BertTokenizerFast

class MiniSunTokenizer:
    def __init__(self, vocab_file=None):
        # You can use BERT's tokenizer or any custom vocabulary tokenizer.
        # Note: the custom-vocab branch is case-sensitive (do_lower_case=False),
        # while the fallback checkpoint 'bert-base-uncased' lowercases input.
        if vocab_file:
            self.tokenizer = BertTokenizerFast(vocab_file=vocab_file, do_lower_case=False)
        else:
            # Default BERT tokenizer without a specific vocab file
            self.tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

        # Define special tokens if needed (customizable; these match BERT's defaults)
        self.pad_token = '[PAD]'
        self.unk_token = '[UNK]'
        self.cls_token = '[CLS]'
        self.sep_token = '[SEP]'
        self.mask_token = '[MASK]'

    def tokenize(self, text):
        # Tokenizes the input text into WordPiece tokens
        return self.tokenizer.tokenize(text)

    def encode(self, text, max_length=512, padding=True, truncation=True):
        # Converts the text into input IDs and an attention mask (TF tensors)
        encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding='max_length' if padding else False,
            truncation=truncation,
            return_attention_mask=True,
            return_tensors='tf',
        )
        return encoded['input_ids'], encoded['attention_mask']

    def decode(self, token_ids):
        # Decodes token IDs back into text, dropping special tokens
        return self.tokenizer.decode(token_ids, skip_special_tokens=True)

    def save_pretrained(self, save_directory):
        # Save the tokenizer files in Hugging Face format
        os.makedirs(save_directory, exist_ok=True)
        self.tokenizer.save_pretrained(save_directory)

# Example usage of the tokenizer
tokenizer = MiniSunTokenizer()
text = "Hello, this is a test sentence for MiniSun model."
input_ids, attention_mask = tokenizer.encode(text, max_length=20)
print("Input IDs:", input_ids)
print("Attention Mask:", attention_mask)