import os

from datasets import (
    load_dataset,
)  # How presumptuous to have HF call their dataset library "datasets"
# Load the "wikitext-103-raw-v1" configuration of Salesforce/wikitext once, at
# import time. save_preprocessed() reads this module-level object and indexes
# it by split name.
dataset = load_dataset("Salesforce/wikitext", "wikitext-103-raw-v1")
def save_preprocessed(output_file, split="train"):
    """Write one split of the module-level ``dataset`` to a single text file.

    The entries of the split's ``"text"`` column are written in order, with
    ``" <EOF> "`` inserted between consecutive entries (nothing is appended
    after the last one). An existing file at ``output_file`` is overwritten.

    Args:
        output_file: Path of the UTF-8 text file to write. Missing parent
            directories are created.
        split: Key used to select the split from ``dataset``.

    Raises:
        OSError: If the parent directory or the file cannot be created or
            written.
    """
    separator = " <EOF> "
    texts = dataset[split]["text"]

    # open() does not create intermediate directories; without this a fresh
    # machine fails with FileNotFoundError. A bare filename has no parent
    # component, and os.makedirs("") would raise, hence the guard.
    parent_dir = os.path.dirname(output_file)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_file, "w", encoding="utf-8") as f:
        # Stream entry by entry instead of joining first: the output is
        # identical, but no second corpus-sized string is held in memory.
        for index, text in enumerate(texts):
            if index:
                f.write(separator)
            f.write(text)
# Export the default ("train") split to a fixed location under the current
# user's home directory. This runs whenever the module is executed or imported.
save_preprocessed(
    os.path.expanduser("~/torch_datasets/wikitext/train/data/corpus_processed.txt")
)