from pathlib import Path

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace
from datasets import load_dataset

# Build a BPE tokenizer; anything not covered by the learned vocabulary maps to [UNK].
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
# Split raw text into word-like pieces (whitespace-delimited words and punctuation)
# before the BPE merges are learned.
tokenizer.pre_tokenizer = Whitespace()
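
# Quick sanity check (a minimal sketch; the sample string is an assumption, not
# from the original script). Whitespace pre-tokenization returns (piece, offsets):
#   >>> tokenizer.pre_tokenizer.pre_tokenize_str("Hello, world!")
#   [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]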

# Reserve the standard special tokens at the start of the vocabulary and stop
# merging once it reaches 30,000 entries.
trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=30000,
)

# Japanese portion of the CC-100 corpus, mirrored on the Hugging Face Hub.
dataset = load_dataset('range3/cc100-ja')


def dataset_iter():
    """Yield every 100th line of the train split to keep training time manageable."""
    skip = 100
    for i in range(0, len(dataset['train']), skip):
        yield dataset['train'][i]['text']


tokenizer.train_from_iterator(dataset_iter(), trainer)

# tokenizer.save() does not create missing directories, so make sure 'data/' exists.
Path('data').mkdir(exist_ok=True)
tokenizer.save('data/tokenizer-cc100-ja.json')
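
# Minimal usage sketch (the sample sentence is an arbitrary assumption, not from
# the original script): reload the saved tokenizer and encode a string.
reloaded = Tokenizer.from_file('data/tokenizer-cc100-ja.json')
encoding = reloaded.encode("吾輩は猫である。")  # hypothetical sample sentence
print(encoding.tokens)  # the subword pieces
print(encoding.ids)     # their vocabulary ids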