# File size: 1,359 Bytes
# f291f93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
import glob
import random
from tokenizers import ByteLevelBPETokenizer
from datasets import load_dataset

# Module-level accumulator of input file paths; add_jsonlines_dir() rebinds it
# to a deduplicated, sorted list on every call.
data_files = []


def add_jsonlines_dir(path, filespec):
    """Add every file matching ``{path}/{filespec}`` to the global ``data_files``.

    Args:
        path: Directory to search.
        filespec: Glob pattern (e.g. ``"*.gz"``) applied inside ``path``.

    Side effects:
        Rebinds the module-level ``data_files`` to a sorted list of unique
        paths and prints the running file count.
    """
    global data_files
    matches = glob.glob(f"{path}/{filespec}")
    # sorted() instead of list(): the iteration order of a set of strings can
    # change between interpreter runs (hash randomization), which would make
    # any later seeded shuffle of this list non-reproducible.
    data_files = sorted(set(data_files).union(matches))
    print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")
# Collect the gzipped files of each corpus directory.
add_jsonlines_dir("/data/c4_cleaned2", "*.gz")
add_jsonlines_dir("/data/nrc_uniq_cleaned_20210223", "*.gz")
add_jsonlines_dir("/data/nu_uniq_cleaned_20210225", "*.gz")

# Fixed seed: the same input order always produces the same shuffled order.
random.Random(42).shuffle(data_files)
total = len(data_files)
print(total)

# Hold out the last `perc` fraction of the shuffled files for validation.
# int() truncates, so very small file counts give an empty validation list.
perc = 0.05
val_size = int(perc * total)
train_size = total - val_size
train = data_files[:train_size]
val = data_files[train_size:]
print(f"Got {len(train)} training files and {perc * 100} % {len(val)} validation files")

# Sanity check: no file may appear in both splits.
assert set(train).isdisjoint(val), "Train overlaps with test"
datasets = load_dataset('json', data_files={'train': train, 'validation': val})



tokenizer = ByteLevelBPETokenizer()

def batch_iterator(batch_size=1000):
    """Yield the ``"text"`` column of the training split in consecutive batches.

    Args:
        batch_size: Maximum number of rows per yielded batch; the final batch
            may be shorter.

    Yields:
        Whatever ``datasets["train"][start:stop]["text"]`` returns for each
        consecutive row slice (presumably a list of strings -- confirm
        against the loaded data).
    """
    train_split = datasets["train"]
    # Iterate over the rows of the training split. len(datasets) is the number
    # of splits in the container, not the number of rows, so using it here
    # would stop after the first batch.
    for start in range(0, len(train_split), batch_size):
        yield train_split[start: start + batch_size]["text"]

# Train the tokenizer on the batches produced by batch_iterator(), registering
# the five special tokens in this order, then write the result to
# tokenizer.json in the current working directory.
tokenizer.train_from_iterator(
    batch_iterator(),
    vocab_size=50358,
    min_frequency=2,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

tokenizer.save("tokenizer.json")