from datasets import load_dataset
from transformers import AutoTokenizer

def get_training_corpus(dataset):
    """
    Yield the original Japanese sentences ("original_ja" field) from the
    dataset, one at a time, for tokenizer training.
    """
    return (element["original_ja"] for element in iter(dataset))

# Stream the corpus so it never has to be fully downloaded:
# the first 100 examples serve as validation, the rest as training data.
dataset = load_dataset("snow_simplified_japanese_corpus", streaming=True, split="train")
train_dataset = dataset.skip(100)
val_dataset = dataset.take(100)
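
# Optional sanity check (a minimal sketch): peek at a few streamed validation
# examples to confirm the expected "original_ja" field is present.
for example in val_dataset.take(3):
    print(example["original_ja"])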

# Base tokenizer to retrain; "rinna/japanese-gpt2-small" is another possible choice.
old_tokenizer = AutoTokenizer.from_pretrained("csebuetnlp/mT5_multilingual_XLSum")
print("Old Tokenizer:", old_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))

# Train a new tokenizer on the streamed corpus with a 52,000-token vocabulary.
new_tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(train_dataset), vocab_size=52000)
print("New Tokenizer:", new_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))

new_tokenizer.save_pretrained("japanese-dummy-tokenizer")
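
# A minimal round-trip check (sketch, assuming the save directory above):
# reload the saved tokenizer and confirm it reproduces the same tokenization.
reloaded_tokenizer = AutoTokenizer.from_pretrained("japanese-dummy-tokenizer")
print("Reloaded Tokenizer:", reloaded_tokenizer.tokenize("誰が一番に着くか私には分かりません。"))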