from tokenizers import Tokenizer
from tokenizers.models import Unigram
from tokenizers.trainers import UnigramTrainer
from tokenizers.normalizers import NFKC
from tokenizers.pre_tokenizers import UnicodeScripts, Digits, Sequence
from datasets import load_dataset

# Load the Bengali ('bn') split of the mC4 corpus
mc4 = load_dataset('mc4', 'bn', split='train')

def batch_iterator(dataset, batch_size=1000):
    # Yield the raw text column in batches so the whole corpus never has to sit in memory at once
    for i in range(0, len(dataset), batch_size):
        yield dataset[i: i + batch_size]["text"]

# Unigram LM tokenizer with NFKC normalization; the pre-tokenizer splits on
# Unicode script boundaries and breaks numbers into individual digits
tokenizer = Tokenizer(Unigram())
tokenizer.normalizer = NFKC()
tokenizer.pre_tokenizer = Sequence([UnicodeScripts(), Digits(individual_digits=True)])
trainer = UnigramTrainer(vocab_size=32000, special_tokens=['<s>', '<pad>', '</s>', '<unk>', '<mask>'])

# Train from the batch iterator; `length` tells the trainer the corpus size for progress reporting
tokenizer.train_from_iterator(batch_iterator(mc4), trainer=trainer, length=len(mc4))
tokenizer.save('tokenizer-mc4-unigram.json')
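
# Quick sanity check (a minimal sketch, not part of the original script; the sample
# sentence is an arbitrary illustration): reload the saved tokenizer and inspect its output.
reloaded = Tokenizer.from_file('tokenizer-mc4-unigram.json')
encoding = reloaded.encode("এটি একটি পরীক্ষা বাক্য 2023")
print(encoding.tokens)  # digits should show up as individual tokens because of Digits(individual_digits=True)
print(encoding.ids)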