eyad-silx
/

llm

Model card Files Files and versions

llm / prepare_data.py

eyad-silx

Update repository

d278d9d 9 months ago

history blame contribute delete

1.15 kB

	import os

	def prepare_enwik8(
	    input_file,
	    output_dir,
	    *,
	    total_bytes=100_000_000,
	    train_bytes=90_000_000,
	    val_bytes=5_000_000,
	):
	    """Prepare the enwik8 dataset splits from an enwik9 dump.

	    Reads the first ``total_bytes`` bytes of ``input_file`` and writes three
	    contiguous, non-overlapping splits into ``output_dir``:

	    - ``train.bin``: the first ``train_bytes`` bytes
	    - ``val.bin``: the next ``val_bytes`` bytes
	    - ``test.bin``: everything left up to ``total_bytes``

	    With the default sizes this is the first 100M bytes split into
	    90M (train), 5M (val) and 5M (test).

	    Args:
	        input_file: Path of the source file, opened in binary mode.
	        output_dir: Directory that receives the three ``.bin`` files; it is
	            created if it does not exist.
	        total_bytes: Number of leading bytes of ``input_file`` to use.
	        train_bytes: Size of the training split in bytes.
	        val_bytes: Size of the validation split in bytes.

	    Raises:
	        ValueError: If a size is negative, if ``train_bytes + val_bytes``
	            exceeds ``total_bytes``, or if ``input_file`` holds fewer than
	            ``total_bytes`` bytes.
	        OSError: If the input cannot be read or the outputs cannot be written.
	    """
	    # A negative size would make read() consume the whole file and would turn
	    # the slice bounds below into offsets from the end, so reject it up front.
	    if min(total_bytes, train_bytes, val_bytes) < 0:
	        raise ValueError("total_bytes, train_bytes and val_bytes must be non-negative")

	    val_end = train_bytes + val_bytes
	    if val_end > total_bytes:
	        raise ValueError(
	            f"train_bytes + val_bytes ({val_end:,}) exceeds total_bytes ({total_bytes:,})"
	        )

	    with open(input_file, 'rb') as f:
	        data = f.read(total_bytes)

	    # A short read means the source is smaller than requested. Slicing it
	    # anyway would silently yield truncated or empty val/test splits.
	    if len(data) < total_bytes:
	        raise ValueError(
	            f"{input_file!r} holds only {len(data):,} bytes; {total_bytes:,} are required"
	        )

	    # Create the output directory only once the input is known to be usable,
	    # so a failed run leaves nothing behind.
	    os.makedirs(output_dir, exist_ok=True)

	    # Slice through a memoryview so the splits are views into ``data``
	    # instead of a second full copy of it.
	    view = memoryview(data)
	    splits = {
	        'train.bin': view[:train_bytes],
	        'val.bin': view[train_bytes:val_end],
	        'test.bin': view[val_end:],
	    }

	    for name, split_data in splits.items():
	        with open(os.path.join(output_dir, name), 'wb') as f:
	            f.write(split_data)
	        print(f"Saved {name} ({len(split_data):,} bytes)")

	if __name__ == "__main__":
	    # Script entry point: read the enwik9 dump at "enwik9/enwik9" and write
	    # the train/val/test splits into the "data" directory.
	    input_file, output_dir = "enwik9/enwik9", "data"
	    prepare_enwik8(input_file=input_file, output_dir=output_dir)
	    print("Dataset preparation completed!")