from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer
from tokenizers.normalizers import NFKC
from tokenizers.pre_tokenizers import UnicodeScripts, Digits, Sequence
from datasets import load_dataset

# Bengali split of the multilingual C4 corpus.
mc4 = load_dataset('mc4', 'bn', split='train')

# Stream the corpus in batches so the whole dataset never has to be held in memory at once.
def batch_iterator(dataset, batch_size=1000):
    for i in range(0, len(dataset), batch_size):
        yield dataset[i: i + batch_size]["text"]

# Unigram LM tokenizer with NFKC normalization; the pre-tokenizer splits on
# Unicode script boundaries and breaks numbers into individual digits.
tokenizer = Tokenizer(Unigram())
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = Sequence([UnicodeScripts(), Digits(individual_digits=True)])

trainer = UnigramTrainer(vocab_size=32000, special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])
tokenizer.train_from_iterator(batch_iterator(mc4), trainer=trainer, length=len(mc4))
tokenizer.save('tokenizer-mc4-unigram.json')
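
# A quick sanity check of the trained tokenizer -- not part of the original
# snippet; the sample sentence below is an arbitrary Bengali phrase chosen
# purely for illustration. Tokenizer.from_file reloads the saved JSON, and
# encode() returns an Encoding whose .tokens and .ids expose the learned subwords.
reloaded = Tokenizer.from_file('tokenizer-mc4-unigram.json')
encoding = reloaded.encode('আমি বাংলায় গান গাই')
print(encoding.tokens)
print(encoding.ids)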