from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer
from tokenizers.normalizers import NFKC
from tokenizers.pre_tokenizers import UnicodeScripts, Digits, Sequence
from datasets import load_dataset

# Load the Bengali split of mC4; batches of raw text are fed to the trainer
# so the whole corpus never has to be materialized at once.
mc4 = load_dataset('mc4', 'bn', split='train')

def batch_iterator(dataset, batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i: i + batch_size]["text"]

# Unigram model with NFKC normalization; the pre-tokenizer first splits on
# Unicode script boundaries, then isolates every digit as its own token.
tokenizer = Tokenizer(Unigram())
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = Sequence([UnicodeScripts(), Digits(individual_digits=True)])

# NOTE: assumed special-token set; adjust to match the downstream model.
trainer = UnigramTrainer(
    vocab_size=32000,
    special_tokens=['<unk>', '<s>', '</s>', '<pad>', '<mask>'],
)

# Train on the batched iterator (length lets the progress bar report totals)
# and serialize the result to a single JSON file.
tokenizer.train_from_iterator(batch_iterator(mc4), trainer=trainer, length=len(mc4))
tokenizer.save('tokenizer-mc4-unigram.json')
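
Once training finishes, the saved JSON file can be reloaded and exercised on a short sentence as a quick sanity check. The snippet below is a minimal sketch; the Bengali sample string and the fields printed are illustrative, not part of the training script above.

from tokenizers import Tokenizer

# Reload the serialized tokenizer and encode a short Bengali sentence.
tok = Tokenizer.from_file('tokenizer-mc4-unigram.json')
encoding = tok.encode('আমি বাংলায় গান গাই')
print(encoding.tokens)            # subword pieces produced by the Unigram model
print(encoding.ids)               # corresponding vocabulary ids
print(tok.decode(encoding.ids))   # round-trip back to text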