File size: 545 Bytes
a8639ac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
from datasets import (
    load_dataset,
)  # How presumptuous to have HF call their dataset library "datasets"
import os


# Loaded once at import time; save_preprocessed() reads its splits from this
# module-level object.
dataset = load_dataset(
    path="Salesforce/wikitext",
    name="wikitext-103-raw-v1",
)


def save_preprocessed(output_file, split="train", source=None):
    """Write the ``"text"`` rows of one split to a file, joined by ``" <EOF> "``.

    The rows are streamed to disk one at a time rather than concatenated into
    a single string first, so the corpus is never duplicated in memory. The
    resulting file content is the same as ``" <EOF> ".join(rows)``.

    Args:
        output_file: Path of the UTF-8 text file to create or overwrite.
            Missing parent directories are created.
        split: Key of the split to export.
        source: Optional mapping from split name to an object whose
            ``["text"]`` entry is an iterable of strings. When omitted, the
            module-level ``dataset`` is used.

    Raises:
        OSError: If the parent directory cannot be created or the file
            cannot be opened for writing.
    """
    if source is None:
        source = dataset

    # Resolve the split before touching the filesystem so a bad split name
    # does not leave an empty file or directory tree behind.
    rows = source[split]["text"]

    parent = os.path.dirname(output_file)
    if parent:
        # open(..., "w") does not create intermediate directories.
        os.makedirs(parent, exist_ok=True)

    separator = " <EOF> "
    with open(output_file, "w", encoding="utf-8") as f:
        for index, text in enumerate(rows):
            if index:
                # Separator goes between rows only, never before the first
                # or after the last, matching str.join semantics.
                f.write(separator)
            f.write(text)


# Export the default ("train") split to a fixed location under the user's
# home directory; "~" is expanded here because open() does not do it.
save_preprocessed(
    output_file=os.path.expanduser(
        "~/torch_datasets/wikitext/train/data/corpus_processed.txt"
    ),
)