|
import glob |
|
import random |
|
from tokenizers import ByteLevelBPETokenizer |
|
from datasets import load_dataset |
|
|
|
# Accumulated list of input file paths, filled by add_jsonlines_dir().
data_files = []


def add_jsonlines_dir(path: str, filespec: str) -> None:
    """Add every file matching ``filespec`` inside ``path`` to ``data_files``.

    The module-level ``data_files`` list is updated in place: new matches are
    appended, duplicates are removed, and the result is kept sorted.

    Args:
        path: Directory to search.
        filespec: Glob pattern, relative to ``path`` (e.g. ``"*.gz"``).
    """
    data_files.extend(glob.glob(f"{path}/{filespec}"))
    # Deduplicate through a set, then sort: iteration order of a set of
    # strings changes between interpreter runs (hash randomization), which
    # would make any later seeded shuffle of this list non-reproducible.
    data_files[:] = sorted(set(data_files))
    print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
|
# Collect the *.gz files from every corpus directory.
for corpus_dir in (
    "/data/c4_cleaned2",
    "/data/nrc_uniq_cleaned_20210223",
    "/data/nu_uniq_cleaned_20210225",
):
    add_jsonlines_dir(corpus_dir, "*.gz")

# Start from a deterministic order before the seeded shuffle; otherwise the
# fixed seed does not give the same train/validation split on every run.
data_files.sort()
random.Random(42).shuffle(data_files)

total = len(data_files)
print(total)

# Hold out the last `perc` fraction of the shuffled files for validation.
# NOTE: with fewer than 1 / perc files, int() truncates val_size to 0 and the
# validation list is empty.
perc = 0.05
val_size = int(perc * total)
train_size = total - val_size
train = data_files[:train_size]
val = data_files[train_size:]
print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")

# Sanity check: no file may appear in both splits.
assert not set(train) & set(val), "Train overlaps with test"

datasets = load_dataset('json', data_files={'train': train, 'validation': val})

tokenizer = ByteLevelBPETokenizer()
|
|
|
def batch_iterator(batch_size=1000, dataset=None):
    """Yield the "text" column of a dataset in consecutive batches.

    Args:
        batch_size: Maximum number of rows per yielded batch.
        dataset: Object supporting ``len()`` and slice indexing, where a
            slice returns a mapping with a "text" key. Defaults to the
            module-level ``datasets["train"]`` split.

    Yields:
        The "text" entry of each successive slice of ``batch_size`` rows.
    """
    if dataset is None:
        dataset = datasets["train"]
    # Step over the rows of the split itself. len(datasets) is the size of
    # the split dict (train/validation), not the row count, so using it as
    # the bound stopped the iteration after the first batch.
    for start in range(0, len(dataset), batch_size):
        yield dataset[start:start + batch_size]["text"]
|
|
|
# Tokens handed to the trainer as special tokens, in this order.
reserved_tokens = ["<s>", "<pad>", "</s>", "<unk>", "<mask>"]

# Train the tokenizer on the text batches produced by batch_iterator().
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50358,
    min_frequency=2,
    special_tokens=reserved_tokens,
)

# Write the trained tokenizer to tokenizer.json in the working directory.
tokenizer.save("tokenizer.json")