from datasets import load_dataset, concatenate_datasets
from tokenizers import trainers, Tokenizer, normalizers
from t5_tokenizer_model import SentencePieceUnigramTokenizer

vocab_size = 50_000
input_sentence_size = None
model_dir = "./"  # ${MODEL_DIR}

# Initialize a dataset
dataset = load_dataset(
    "json",
    data_files=[
        "/mnt/disks/flaxdisk/corpus/norwegian_colossal_corpus_validation.json",
        "/mnt/disks/flaxdisk/corpus/special_chars.json",
    ],
    split="train",
)

tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")


# Build an iterator over this dataset
def batch_iterator(input_sentence_size=None):
    if input_sentence_size is None:
        input_sentence_size = len(dataset)
    batch_length = 100
    for i in range(0, input_sentence_size, batch_length):
        yield dataset[i: i + batch_length]["text"]


# Train the tokenizer
tokenizer.train_from_iterator(
    iterator=batch_iterator(input_sentence_size=input_sentence_size),
    vocab_size=vocab_size,
    show_progress=True,
)

# Save files to disk
tokenizer.save(f"{model_dir}/tokenizer.json")
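
# Optional sanity check: reload the saved tokenizer and encode a sample sentence.
# This is a minimal sketch, not part of the training script proper; it reuses the
# `Tokenizer` class already imported above and assumes the special tokens defined
# earlier. The Norwegian sentence below is an arbitrary example.
loaded_tokenizer = Tokenizer.from_file(f"{model_dir}/tokenizer.json")
encoding = loaded_tokenizer.encode("Dette er en test.")
print(encoding.tokens)  # subword pieces produced by the trained Unigram model
print(encoding.ids)     # corresponding vocabulary ids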