import glob
import random

from tokenizers import ByteLevelBPETokenizer
from datasets import load_dataset

data_files = []

def add_jsonlines_dir(path, filespec):
    global data_files
    data_files += glob.glob(f"{path}/{filespec}")
    data_files = list(set(data_files))
    print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")

# Collect the gzipped JSON-lines shards from each cleaned corpus directory.
add_jsonlines_dir("/data/c4_cleaned2", "*.gz")
add_jsonlines_dir("/data/nrc_uniq_cleaned_20210223", "*.gz")
add_jsonlines_dir("/data/nu_uniq_cleaned_20210225", "*.gz")

# Shuffle deterministically and hold out 5% of the files for validation.
random.Random(42).shuffle(data_files)

total = len(data_files)
print(total)
perc = 0.05
val_size = int(perc * total)
train_size = total - val_size
train = data_files[:train_size]
val = data_files[train_size:]
print(f"Got {len(train)} training files and {perc * 100}% {len(val)} validation files")

assert list(set(train) & set(val)) == [], "Train overlaps with validation"

datasets = load_dataset("json", data_files={"train": train, "validation": val})

tokenizer = ByteLevelBPETokenizer()

def batch_iterator(batch_size=1000):
    # Yield batches of raw text from the training split.
    for i in range(0, len(datasets["train"]), batch_size):
        yield datasets["train"][i : i + batch_size]["text"]

tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50358,
    min_frequency=2,
    # Standard RoBERTa-style special tokens for a ByteLevel BPE tokenizer.
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)

tokenizer.save("tokenizer.json")
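
# Optional sanity check (a minimal sketch, not part of the original script):
# reload the serialized tokenizer.json with tokenizers.Tokenizer.from_file and
# inspect how it splits a sample sentence. The Dutch sentence below is an
# illustrative placeholder, chosen because the corpora above are Dutch.
from tokenizers import Tokenizer

tok = Tokenizer.from_file("tokenizer.json")
encoding = tok.encode("Dit is een voorbeeldzin om de tokenizer te controleren.")
print(encoding.tokens)  # the subword pieces produced by the trained BPE model
print(encoding.ids)     # their corresponding vocabulary ids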