|
from pathlib import Path
from typing import Iterator, List

from datasets import load_dataset, concatenate_datasets
from tokenizers import ByteLevelBPETokenizer
|
|
|
# Share of the combined corpus held out as the test split, in percent.
validation_split_percentage = 10
# Language code used to select the matching configuration of each corpus.
dataset_language = "su"
|
|
|
|
|
|
|
# Load the "train" split of each named corpus for the configured language.
oscar = load_dataset(
    "oscar",
    f"unshuffled_deduplicated_{dataset_language}",
    split="train",
)
cc100 = load_dataset("cc100", split="train", lang=dataset_language)
mc4 = load_dataset("mc4", dataset_language, split="train")

# Local plain-text sources: every *.txt file directly under ../docs, read
# through the generic "text" loader.
wiki_files = list(map(str, Path("../docs").glob("*.txt")))
wiki = load_dataset("text", data_files=wiki_files)
|
|
|
|
|
# Drop the extra columns listed here from each source before merging.
oscar = oscar.remove_columns("id")
cc100 = cc100.remove_columns("id")
mc4 = mc4.remove_columns(["url", "timestamp"])

# Merge the four sources, then carve out the held-out split with a fixed seed.
dataset = concatenate_datasets([oscar, mc4, cc100, wiki["train"]]).train_test_split(
    test_size=validation_split_percentage / 100,
    seed=42,
)
|
|
|
|
|
# Untrained byte-level BPE tokenizer; it is trained later in this script.
tokenizer = ByteLevelBPETokenizer()
|
|
|
|
|
def batch_iterator(batch_size: int = 10000, split=None) -> Iterator[List[str]]:
    """Yield the "text" column of a dataset split in consecutive batches.

    Args:
        batch_size: Maximum number of rows per yielded batch.
        split: Object supporting ``len()`` and row slicing that returns a
            mapping with a "text" entry. Defaults to the "train" split of
            the module-level ``dataset``.

    Yields:
        The "text" values of each ``batch_size``-row slice, in order; the
        final batch may be shorter.
    """
    if split is None:
        split = dataset["train"]
    # Bound the loop by the number of rows in the split itself. ``dataset``
    # is the result of ``train_test_split``, so ``len(dataset)`` counts the
    # splits rather than the rows and would stop after the first batch.
    for start in range(0, len(split), batch_size):
        yield split[start : start + batch_size]["text"]
|
|
|
|
|
|
|
# Learn the BPE vocabulary from the batches produced by batch_iterator().
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50265,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)
|
|
|
|
|
# Serialize the tokenizer to tokenizer.json inside model_dir, which is the
# current working directory here.
model_dir = "."
tokenizer.save(f"{model_dir}/tokenizer.json")
|
|
|
|