llm / prepare_data.py
eyad-silx
Update repository
d278d9d
import os
def prepare_enwik8(input_file, output_dir, total_bytes=100_000_000,
                   train_bytes=90_000_000, val_bytes=5_000_000):
    """Split the leading bytes of a raw dump into train/val/test files.

    With the default sizes this prepares enwik8 from enwik9: the first
    100M bytes are read and split into train (90M), val (5M) and test (5M).

    Args:
        input_file: Path to the source file; it is read in binary mode.
        output_dir: Directory that receives ``train.bin``, ``val.bin`` and
            ``test.bin``. It is created if it does not exist.
        total_bytes: Number of leading bytes to take from ``input_file``.
        train_bytes: Size of the train split in bytes.
        val_bytes: Size of the validation split in bytes. The test split
            is whatever remains of ``total_bytes`` after train and val.

    Raises:
        ValueError: If the requested sizes are inconsistent, or if
            ``input_file`` holds fewer than ``total_bytes`` bytes.
    """
    if min(total_bytes, train_bytes, val_bytes) < 0:
        raise ValueError("split sizes must be non-negative")
    if train_bytes + val_bytes > total_bytes:
        raise ValueError(
            f"train_bytes + val_bytes ({train_bytes + val_bytes:,}) "
            f"exceeds total_bytes ({total_bytes:,})"
        )

    with open(input_file, 'rb') as f:
        data = f.read(total_bytes)

    # read() returns fewer bytes when the file is shorter than requested;
    # slicing would then silently yield truncated or empty val/test splits.
    # Checked before creating output_dir so a bad input leaves nothing behind.
    if len(data) < total_bytes:
        raise ValueError(
            f"{input_file} holds only {len(data):,} bytes; "
            f"{total_bytes:,} are required"
        )

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Slice through a memoryview so the splits are views into the buffer
    # already in memory rather than second copies of it.
    view = memoryview(data)
    val_end = train_bytes + val_bytes
    splits = {
        'train.bin': view[:train_bytes],
        'val.bin': view[train_bytes:val_end],
        'test.bin': view[val_end:],
    }

    for name, split_data in splits.items():
        with open(os.path.join(output_dir, name), 'wb') as f:
            f.write(split_data)
        print(f"Saved {name} ({len(split_data):,} bytes)")
if __name__ == "__main__":
    # Script entry point: read the dump at the relative path enwik9/enwik9
    # and write the splits into the relative directory "data".
    prepare_enwik8(input_file="enwik9/enwik9", output_dir="data")
    print("Dataset preparation completed!")