from datasets import load_dataset, concatenate_datasets
from tokenizers import ByteLevelBPETokenizer
from pathlib import Path
# Corpus language code and the percentage of rows held out as the test split.
dataset_language = "su"
validation_split_percentage = 10

# Load each corpus (train subset only, which is all tokenizer training needs)
# and drop its non-text columns right away so only "text" remains.
oscar = load_dataset(
    "oscar",
    f"unshuffled_deduplicated_{dataset_language}",
    split="train",
).remove_columns("id")
cc100 = load_dataset(
    "cc100",
    lang=dataset_language,
    split="train",
).remove_columns("id")
mc4 = load_dataset(
    "mc4",
    dataset_language,
    split="train",
).remove_columns(["url", "timestamp"])

# Local plain-text files, loaded through the generic "text" builder.
wiki_files = list(map(str, Path("../docs").glob("*.txt")))
wiki = load_dataset("text", data_files=wiki_files)

# Merge all corpora, then carve off the held-out split with a fixed seed.
dataset = concatenate_datasets(
    [oscar, mc4, cc100, wiki["train"]]
).train_test_split(
    test_size=validation_split_percentage / 100,
    seed=42,
)

# Tokenizer to be trained on the merged corpus.
tokenizer = ByteLevelBPETokenizer()
def batch_iterator(batch_size=10000, train_split=None):
    """Yield the training texts in consecutive batches.

    Args:
        batch_size: Maximum number of rows in each yielded batch.
        train_split: Optional sized, sliceable collection whose slices expose
            a "text" column. Defaults to the module-level ``dataset["train"]``.

    Yields:
        The "text" column of each consecutive slice of at most
        ``batch_size`` rows.
    """
    if train_split is None:
        train_split = dataset["train"]
    # `dataset` comes from train_test_split, so len(dataset) would count the
    # splits rather than the rows; size the loop by the train split itself so
    # every row is visited.
    for start in range(0, len(train_split), batch_size):
        yield train_split[start : start + batch_size]["text"]
# Customized training
# Five empty strings cannot serve as distinct special tokens.
# NOTE(review): restored to the RoBERTa-style set, which matches the
# vocab_size of 50265; presumably the angle-bracket tokens were stripped
# somewhere upstream -- confirm against the intended model config.
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50265,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

# Save files to disk
model_dir = "."
tokenizer.save(f"{model_dir}/tokenizer.json")