#!/usr/bin/env python3
from datasets import load_dataset
from datasets import load_from_disk
from tokenizers import ByteLevelBPETokenizer
from tqdm import tqdm

# Load dataset
# dataset = load_dataset("oscar", "unshuffled_deduplicated_hi", split="train")
dataset = load_from_disk("/home/rtx/work/dk/hf/vo")

# Instantiate tokenizer
tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)


def batch_iterator(batch_size=100_000):
    # Stream the "text" column in batches so the whole dataset never has to be materialized at once
    for i in range(0, len(dataset), batch_size):
        yield dataset[i: i + batch_size]["text"]


# Customized training
# Special tokens assumed to be the standard RoBERTa-style set used with ByteLevelBPETokenizer
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50265,
    min_frequency=50,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

# Save files to disk
tokenizer.save("./tokenizer.json")
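
# Optional sanity check -- a minimal sketch that reloads the tokenizer saved above
# with tokenizers.Tokenizer.from_file and encodes a sample string; the sample
# sentence is purely illustrative and not part of the training data.
from tokenizers import Tokenizer

loaded = Tokenizer.from_file("./tokenizer.json")
encoding = loaded.encode("This is a quick test of the trained tokenizer.")
print(encoding.tokens)
print(encoding.ids)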