"""Convert a pair of plain-text split files into a dataset saved on disk.

Usage:
    python <this_script> PATH_PREFIX OUTPUT_PATH [--cache-dir DIR]

The script loads ``PATH_PREFIX + "_train_text.txt"`` as the ``train`` split
and ``PATH_PREFIX + "_val_text.txt"`` as the ``validation`` split using the
``"text"`` loader of ``datasets.load_dataset``, then writes the result to
``OUTPUT_PATH`` with ``save_to_disk``.
"""

import argparse

from datasets import load_dataset

parser = argparse.ArgumentParser(
    description="Load train/validation text files and save them as a dataset.",
)
parser.add_argument(
    "path_prefix",
    help='common prefix of the input files; "_train_text.txt" and '
    '"_val_text.txt" are appended to it',
)
parser.add_argument(
    "output_path",
    help="directory the loaded dataset is saved to",
)
# The cache location used to be hard-coded. The default is unchanged, so
# existing invocations behave exactly as before; the flag only lets callers
# override it on machines where /dev/shm is unavailable or too small.
parser.add_argument(
    "--cache-dir",
    default="/dev/shm/.cache",
    help="cache directory passed to load_dataset (default: %(default)s)",
)
args = parser.parse_args()

path_prefix: str = args.path_prefix
output_path: str = args.output_path
cache_dir: str = args.cache_dir

# Each split is given as a one-element list of file paths built from the
# shared prefix; the split names are the keys of `data_files`.
dataset = load_dataset(
    "text",
    data_files={
        "train": [path_prefix + "_train_text.txt"],
        "validation": [path_prefix + "_val_text.txt"],
    },
    cache_dir=cache_dir,
)

# Persist both splits under `output_path`.
dataset.save_to_disk(output_path)