File size: 496 Bytes
cc8e143
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import argparse

from datasets import load_dataset


# Default kept from the original script. /dev/shm is a Linux shared-memory
# mount; pass --cache-dir on hosts where it is missing or too small.
DEFAULT_CACHE_DIR = "/dev/shm/.cache"


def build_parser() -> argparse.ArgumentParser:
    """Return the command-line parser for this script.

    Positional arguments ``path_prefix`` and ``output_path`` are unchanged
    from the original interface; ``--cache-dir`` is optional and defaults to
    ``DEFAULT_CACHE_DIR`` so existing invocations behave as before.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Load <path_prefix>_train_text.txt and <path_prefix>_val_text.txt "
            "as a text dataset and save it to disk."
        ),
    )
    parser.add_argument(
        "path_prefix",
        help="prefix shared by the '_train_text.txt' and '_val_text.txt' files",
    )
    parser.add_argument(
        "output_path",
        help="directory the loaded dataset is saved to",
    )
    parser.add_argument(
        "--cache-dir",
        default=DEFAULT_CACHE_DIR,
        help="cache directory passed to load_dataset (default: %(default)s)",
    )
    return parser


def data_files_for(path_prefix: str) -> dict:
    """Map each split name to the list of text files derived from *path_prefix*."""
    return {
        "train": [path_prefix + "_train_text.txt"],
        "validation": [path_prefix + "_val_text.txt"],
    }


def main(argv=None) -> None:
    """Parse arguments, load the two text splits, and save them to disk.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)

    dataset = load_dataset(
        "text",
        data_files=data_files_for(args.path_prefix),
        cache_dir=args.cache_dir,
    )
    dataset.save_to_disk(args.output_path)


# Run only when executed as a script, so importing the module does not parse
# the importer's command line or touch the filesystem.
if __name__ == "__main__":
    main()