# Bengali-t5 / train_unigram_tokenizer_for_t5.py
from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer
from tokenizers.normalizers import NFKC
from tokenizers.pre_tokenizers import UnicodeScripts, Digits, Sequence
from datasets import load_dataset

# Download the Bengali (bn) split of mC4; the full train split is large, so
# the first run will take a while to download.
mc4 = load_dataset('mc4', 'bn', split='train')


def batch_iterator(dataset, batch_size=1000):
    # Yield the raw text column in batches so the trainer can stream the
    # corpus instead of materializing it all in memory at once.
    for i in range(0, len(dataset), batch_size):
        yield dataset[i: i + batch_size]["text"]

# Unigram language-model tokenizer, the same model family SentencePiece/T5 use.
tokenizer = Tokenizer(Unigram())
# NFKC normalization folds compatibility variants into one canonical code-point form.
tokenizer.normalizer = NFKC()
# Pre-tokenize on Unicode script boundaries and split numbers into single digits.
tokenizer.pre_tokenizer = Sequence([UnicodeScripts(), Digits(individual_digits=True)])
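
# Optional sanity check (a sketch; the sample string below is an illustrative
# assumption, not from the original script): inspect how the pre-tokenizer
# splits a string mixing Bengali text, Bengali digits, and Latin text.
print(tokenizer.pre_tokenizer.pre_tokenize_str("বাংলা ১২৩ abc"))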

# 32k vocabulary with the special tokens T5-style models expect.
trainer = UnigramTrainer(vocab_size=32000, special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])
# Passing length lets the trainer report progress over the full corpus.
tokenizer.train_from_iterator(batch_iterator(mc4), trainer=trainer, length=len(mc4))
tokenizer.save('tokenizer-mc4-unigram.json')
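
# Downstream usage, a minimal sketch (not part of the original script): wrap
# the saved JSON in a transformers fast tokenizer so it can be used for T5
# training. The token mapping mirrors the special_tokens passed to the trainer
# above; the variable name t5_tokenizer is an illustrative assumption.
from transformers import PreTrainedTokenizerFast

t5_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file='tokenizer-mc4-unigram.json',
    bos_token='<s>',
    eos_token='</s>',
    pad_token='<pad>',
    unk_token='<unk>',
    mask_token='<mask>',
)
print(t5_tokenizer.tokenize("বাংলা ভাষা"))  # quick smoke test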