import glob
import random

from clean import clean_text
from datasets import load_dataset

# Initial source: stream the Dutch OSCAR dump. Note that this assignment is
# superseded by the load_dataset calls further below.
dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl", split='train', streaming=True)
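
# With streaming=True the result is an IterableDataset; a quick peek at a few
# examples without materializing the dump could look like this (sketch):
# from itertools import islice
# for example in islice(dataset_v0, 2):
#     print(example['text'][:100])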

# data_dir = "/home/yeb"
data_dir = "/home/yeb/Developer/data"
data_files = []

def train_val_files():
    """Glob jsonlines shards, shuffle them deterministically, and split them
    95/5 into train and validation file lists."""
    SEED = 12345

    def add_jsonlines_dir(path, filespec):
        # Accumulate matching shard paths, deduplicating along the way.
        global data_files
        data_files += glob.glob(f"{path}/{filespec}")
        data_files = list(set(data_files))
        print(f"Number of files {len(data_files)} after adding {path} glob {filespec}")

    # add_jsonlines_dir(f"{data_dir}/oscar_nl_cleaned")
    add_jsonlines_dir(f"{data_dir}/c4_cleaned2", "*73*.gz")
#     add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*47*.gz")
#     add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*12*.gz")
#     add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*29*.gz")
#     add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*74*.gz")
#     add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*26*.gz")
#     add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*54*.gz")
#     add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*68*.gz")
#     add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*57*.gz")
    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*46*.gz")
    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*35*.gz")
    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*13*.gz")
    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*41*.gz")
    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*52*.gz")
    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*63*.gz")
    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*85*.gz")
    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*81*.gz")
    # add_jsonlines_dir(f"{data_dir}/c4_cleaned", "*96*.gz")
    # add_jsonlines_dir(f"{data_dir}/nrc_uniq_cleaned_20210223", "*.gz")
#     add_jsonlines_dir(f"{data_dir}/nu_uniq_cleaned_20210225", "*.gz")
    random.Random(SEED).shuffle(data_files)

    total = len(data_files)
    print(f"Total number of files: {total}")
    perc = 0.05
    val_size = int(perc * total)
    train_size = total - val_size
    train = data_files[:train_size]
    val = data_files[train_size:]
    print(f"Got {len(train)} training files and {len(val)} validation files ({perc * 100:.0f}% validation)")

    assert not set(train) & set(val), "Train overlaps with validation"

    return train, val

train, val = train_val_files()
dataset_v0 = load_dataset('json', data_files={'train': train, 'validation': val})


# Alternative source: the full (non-streaming) Dutch OSCAR dump. Note that this
# overrides the json dataset loaded just above; keep only one of the two active.
dataset_v0 = load_dataset('oscar', "unshuffled_deduplicated_nl")
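
# Sanity check (optional sketch): the non-streaming load returns a DatasetDict;
# printing it shows the available splits and their row counts.
# print(dataset_v0)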

def f(obj):
    # clean_text normalizes an example's text and may return None for examples
    # that should be dropped (they are filtered out below).
    obj["text"] = clean_text(obj["text"])
    return obj


dataset_v1 = dataset_v0.map(
    f,
    batched=False,
    num_proc=10,
)
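
# A batched variant of the same map (sketch, assuming clean_text takes a single
# string as used above); batched=True reduces per-example overhead:
# dataset_v1 = dataset_v0.map(
#     lambda batch: {"text": [clean_text(t) for t in batch["text"]]},
#     batched=True,
#     num_proc=10,
# )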

# Drop examples for which clean_text returned None.
dataset_v2 = dataset_v1.filter(
    lambda obj: obj['text'] is not None,
    num_proc=10,
)
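
# Optional check (sketch): compare row counts per split before and after
# filtering to see how many examples were rejected.
# for split in dataset_v2:
#     print(split, dataset_v1[split].num_rows, '->', dataset_v2[split].num_rows)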

# Spot-check the first few examples before and after cleaning.
it = iter(dataset_v0['train'])
print(next(it))
print(next(it))
print(next(it))

it = iter(dataset_v1['train'])
print(next(it))
print(next(it))
print(next(it))

# it = iter(dataset_v2['train'])
# print(next(it))
# print(next(it))
# print(next(it))