# File size: 1,359 Bytes
# f291f93 (presumably the short commit hash of this file revision)
import glob
import random
from tokenizers import ByteLevelBPETokenizer
from datasets import load_dataset
# Paths of every data file registered so far via add_jsonlines_dir().
data_files = []


def add_jsonlines_dir(path, filespec):
    """Register the files in *path* that match the glob *filespec*.

    Matching paths are merged into the module-level ``data_files`` list,
    which is updated in place, and a summary line is printed.

    Args:
        path: Directory to search.
        filespec: Glob pattern applied inside *path* (e.g. ``"*.gz"``).
    """
    data_files.extend(glob.glob(f"{path}/{filespec}"))
    # Deduplicate, then sort. Iteration order of a set of strings changes
    # between interpreter runs (hash randomisation), so without the sort a
    # later seeded shuffle of this list would not be reproducible.
    data_files[:] = sorted(set(data_files))
    print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
# Register the gzipped shards of each corpus directory.
for _corpus_dir in (
    "/data/c4_cleaned2",
    "/data/nrc_uniq_cleaned_20210223",
    "/data/nu_uniq_cleaned_20210225",
):
    add_jsonlines_dir(_corpus_dir, "*.gz")

# Fixed seed: the same input ordering always produces the same shuffle.
random.Random(42).shuffle(data_files)
total = len(data_files)
print(total)

# Hold out a fraction of the files (rounded down) for validation; the
# remaining files at the front of the shuffled list are used for training.
perc = 0.05
val_size = int(perc * total)
train_size = total - val_size
train, val = data_files[:train_size], data_files[train_size:]
print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")

# Sanity check: no file may appear in both splits.
assert not set(train).intersection(val), "Train overlaps with test"

datasets = load_dataset('json', data_files={'train': train, 'validation': val})
tokenizer = ByteLevelBPETokenizer()
def batch_iterator(batch_size=1000):
    """Yield the ``"text"`` column of the training split in batches.

    Args:
        batch_size: Maximum number of rows per yielded batch.

    Yields:
        The ``"text"`` values of ``datasets["train"]`` for each consecutive
        slice of at most *batch_size* rows; the last batch may be shorter.
    """
    train_split = datasets["train"]
    # Iterate over the rows of the training split. len(datasets) would be
    # the number of splits, which stops the loop after the first batch.
    for start in range(0, len(train_split), batch_size):
        yield train_split[start: start + batch_size]["text"]
# Train the tokenizer on the batches produced by batch_iterator(), declaring
# the special tokens up front, then write the result to "tokenizer.json".
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50358,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)
tokenizer.save("tokenizer.json")