from datasets import load_dataset, concatenate_datasets
from tokenizers import trainers, Tokenizer, normalizers
from t5_tokenizer_model import SentencePieceUnigramTokenizer

vocab_size = 50_000
input_sentence_size = None
model_dir = "./"  # ${MODEL_DIR}

# Initialize a dataset
dataset = load_dataset(
    "json",
    data_files=[
        "/mnt/disks/flaxdisk/corpus/norwegian_colossal_corpus_validation.json",
        "/mnt/disks/flaxdisk/corpus/special_chars.json",
    ],
    split="train",
)
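# Each JSON record is expected to expose a "text" field; the batch iterator below
# reads that field when feeding raw text to the trainer.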
tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")

# Build an iterator over this dataset
def batch_iterator(input_sentence_size=None):
    if input_sentence_size is None:
        input_sentence_size = len(dataset)
    batch_length = 100
    for i in range(0, input_sentence_size, batch_length):
        yield dataset[i : i + batch_length]["text"]
# Train tokenizer
tokenizer.train_from_iterator(
    iterator=batch_iterator(input_sentence_size=input_sentence_size),
    vocab_size=vocab_size,
    show_progress=True,
)
# Save files to disk
tokenizer.save(f"{model_dir}/tokenizer.json")
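# Optional sanity check (illustrative sketch; the sample sentence is an arbitrary choice,
# not part of the training script): reload the saved tokenizer.json and confirm that a
# Norwegian sentence is tokenized with the freshly trained vocabulary.
loaded_tokenizer = Tokenizer.from_file(f"{model_dir}/tokenizer.json")
print(loaded_tokenizer.encode("Dette er en test.").tokens)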