|
import os |
|
|
|
def prepare_enwik8(
    input_file,
    output_dir,
    total_bytes=100_000_000,
    train_bytes=90_000_000,
    val_bytes=5_000_000,
):
    """Write train/val/test splits taken from the head of ``input_file``.

    Reads the first ``total_bytes`` bytes of ``input_file`` (with the
    defaults: the first 100M bytes of enwik9, i.e. enwik8) and writes three
    consecutive, non-overlapping files into ``output_dir``:

    - ``train.bin``: the first ``train_bytes`` bytes (default 90M)
    - ``val.bin``:   the next ``val_bytes`` bytes (default 5M)
    - ``test.bin``:  the remainder up to ``total_bytes`` (default 5M)

    One ``Saved <name> (<size> bytes)`` line is printed per file written.

    Args:
        input_file: Path of the source file, opened in binary mode.
        output_dir: Directory receiving the split files; created (including
            parents) if it does not exist.
        total_bytes: Number of leading bytes of ``input_file`` to use.
        train_bytes: Size of the training split.
        val_bytes: Size of the validation split.

    Raises:
        ValueError: If any size is negative, if ``train_bytes + val_bytes``
            exceeds ``total_bytes``, or if ``input_file`` holds fewer than
            ``total_bytes`` bytes.
        OSError: If the input cannot be read or an output cannot be written.
    """
    if min(total_bytes, train_bytes, val_bytes) < 0:
        raise ValueError("total_bytes, train_bytes and val_bytes must be >= 0")
    if train_bytes + val_bytes > total_bytes:
        raise ValueError(
            f"train_bytes + val_bytes ({train_bytes + val_bytes:,}) "
            f"exceeds total_bytes ({total_bytes:,})"
        )

    with open(input_file, 'rb') as f:
        data = f.read(total_bytes)

    # A short input would otherwise produce truncated or empty val/test
    # splits without any indication that something went wrong.
    if len(data) < total_bytes:
        raise ValueError(
            f"{input_file} provides only {len(data):,} bytes; "
            f"{total_bytes:,} are required"
        )

    # Create the directory only once the input is known to be usable, so a
    # failed run does not leave an empty output directory behind.
    os.makedirs(output_dir, exist_ok=True)

    # Slicing a memoryview does not copy, so peak memory stays at roughly
    # one buffer instead of two.
    view = memoryview(data)
    val_end = train_bytes + val_bytes
    splits = {
        'train.bin': view[:train_bytes],
        'val.bin': view[train_bytes:val_end],
        'test.bin': view[val_end:],
    }

    for name, split_data in splits.items():
        with open(os.path.join(output_dir, name), 'wb') as f:
            f.write(split_data)
        print(f"Saved {name} ({len(split_data):,} bytes)")
|
|
|
if __name__ == "__main__":
    # Script entry point: read the enwik9 dump at its conventional relative
    # path and write the split files into the local "data" directory.
    prepare_enwik8(input_file="enwik9/enwik9", output_dir="data")
    print("Dataset preparation completed!")
|
|