nano-coder-zerogpu / prepare_code_dataset.py

Upload prepare_code_dataset.py with huggingface_hub

fe53008 verified 2 months ago

4.36 kB

	"""
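	Data preparation script for training nanoGPT on the flytech/python-codes-25k dataset.
	This script downloads the dataset, builds a character-level vocabulary, encodes the
	text as integer ids, and writes the files needed for training (train.bin, val.bin
	and meta.pkl) to data/python-codes-25k.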
	"""

	import os
	import pickle
	import numpy as np
	from datasets import load_dataset
	from tqdm import tqdm

	def _pick_code_field(item):
	    """Return the first non-blank string among an example's candidate fields.

	    The keys 'text', 'output' and 'code' are tried in that order. A field that
	    is missing, not a string, or only whitespace is skipped so that a blank
	    earlier field cannot hide usable content in a later one.

	    Returns:
	        The selected string, or None when no candidate field qualifies.
	    """
	    for field in ('text', 'output', 'code'):
	        candidate = item.get(field)
	        if isinstance(candidate, str) and candidate.strip():
	            return candidate
	    return None


	def _encode_texts(texts, stoi, desc):
	    """Encode a list of strings as one flat uint16 array of character ids.

	    Consecutive samples are separated by a newline id, mirroring the newline
	    join used when the vocabulary is built, so the end of one sample does not
	    fuse with the start of the next.

	    Args:
	        texts: Strings to encode; every character must be a key of ``stoi``.
	        stoi: Mapping from character to integer id.
	        desc: Label shown on the tqdm progress bar.

	    Returns:
	        A 1-D ``np.uint16`` array (empty when ``texts`` is empty).
	    """
	    # The newline id is only absent when the corpus holds a single sample
	    # without line breaks, in which case no separator is ever needed.
	    separator = None
	    if '\n' in stoi:
	        separator = np.array([stoi['\n']], dtype=np.uint16)

	    chunks = []
	    for text in tqdm(texts, desc=desc):
	        if chunks and separator is not None:
	            chunks.append(separator)
	        # Build the ids straight into a compact array instead of growing a
	        # Python list with one int object per character.
	        chunks.append(
	            np.fromiter((stoi[c] for c in text), dtype=np.uint16, count=len(text))
	        )

	    if not chunks:
	        return np.empty(0, dtype=np.uint16)
	    return np.concatenate(chunks)


	def download_and_prepare_code_dataset():
	    """Download flytech/python-codes-25k and write nanoGPT training files.

	    Steps:
	      1. Load the dataset and print a short preview of its first example.
	      2. Collect one code string per example (see ``_pick_code_field``).
	      3. Split the samples 90/10, in dataset order, into train and validation.
	      4. Build a character-level vocabulary over all samples and save it,
	         together with the char<->id mappings, to ``meta.pkl``.
	      5. Encode both splits as uint16 ids and write ``train.bin`` / ``val.bin``.

	    Returns:
	        The output directory path, ``data/python-codes-25k``.

	    Raises:
	        ValueError: If no usable code samples are found, or if the vocabulary
	            has more entries than a uint16 id can represent.
	    """
	    print("Loading flytech/python-codes-25k dataset...")
	    dataset = load_dataset("flytech/python-codes-25k")

	    print(f"Dataset structure: {dataset}")
	    print(f"Available splits: {list(dataset.keys())}")
	    print(f"Train split size: {len(dataset['train'])}")

	    # Debug: show the first example so the field layout is visible in the log.
	    if len(dataset['train']) > 0:
	        print("\nFirst example structure:")
	        first_example = dataset['train'][0]
	        for key, value in first_example.items():
	            # Only strings are truncated; other value types are shown as-is
	            # rather than failing on the slice.
	            preview = value[:200] if isinstance(value, str) else value
	            print(f" {key}: {repr(preview)}...")  # Show first 200 chars

	    # Create data directory
	    data_dir = os.path.join('data', 'python-codes-25k')
	    os.makedirs(data_dir, exist_ok=True)

	    # Extract code content from the dataset
	    print("Extracting code content...")
	    train_texts = []
	    for item in tqdm(dataset['train'], desc="Processing train split"):
	        code = _pick_code_field(item)
	        if code is not None:
	            train_texts.append(code)
	    print(f"Extracted {len(train_texts)} total samples")

	    if not train_texts:
	        # Fail loudly instead of silently writing empty .bin files.
	        raise ValueError(
	            "No code samples found in the 'train' split "
	            "(looked for non-empty 'text', 'output' or 'code' fields)."
	        )

	    # Split training data into train and validation sets (90/10 split)
	    print("Splitting data into train and validation sets...")
	    split_idx = int(0.9 * len(train_texts))
	    train_texts_final = train_texts[:split_idx]
	    test_texts = train_texts[split_idx:]  # Use last 10% as validation

	    print(f"Final train samples: {len(train_texts_final)}")
	    print(f"Validation samples: {len(test_texts)}")

	    # Combine all texts for vocabulary building
	    all_text = '\n'.join(train_texts_final + test_texts)
	    print(f"Total characters: {len(all_text):,}")

	    # Create vocabulary from the text
	    print("Creating vocabulary...")
	    chars = sorted(set(all_text))
	    vocab_size = len(chars)
	    print(f"Vocabulary size: {vocab_size}")

	    # Ids are stored as uint16 below; refuse a vocabulary that cannot fit
	    # rather than letting ids overflow.
	    max_vocab = int(np.iinfo(np.uint16).max) + 1
	    if vocab_size > max_vocab:
	        raise ValueError(
	            f"Vocabulary size {vocab_size} exceeds the uint16 limit of {max_vocab}."
	        )

	    # Create character to integer mapping
	    stoi = {ch: i for i, ch in enumerate(chars)}
	    itos = {i: ch for i, ch in enumerate(chars)}

	    # Save vocabulary metadata
	    meta = {
	        'vocab_size': vocab_size,
	        'itos': itos,
	        'stoi': stoi,
	    }
	    meta_path = os.path.join(data_dir, 'meta.pkl')
	    with open(meta_path, 'wb') as f:
	        pickle.dump(meta, f)
	    print(f"Saved vocabulary to {meta_path}")

	    # Tokenize training data
	    print("Tokenizing training data...")
	    train_ids = _encode_texts(train_texts_final, stoi, "Tokenizing train")

	    # Tokenize validation data
	    print("Tokenizing test data...")
	    test_ids = _encode_texts(test_texts, stoi, "Tokenizing test")

	    # Save as binary files
	    train_path = os.path.join(data_dir, 'train.bin')
	    test_path = os.path.join(data_dir, 'val.bin')  # nanoGPT expects 'val.bin'

	    train_ids.tofile(train_path)
	    test_ids.tofile(test_path)

	    print(f"Saved training data to {train_path} ({len(train_ids):,} tokens)")
	    print(f"Saved validation data to {test_path} ({len(test_ids):,} tokens)")

	    # Print some statistics
	    print("\nDataset statistics:")
	    print(f"Vocabulary size: {vocab_size}")
	    print(f"Training tokens: {len(train_ids):,}")
	    print(f"Validation tokens: {len(test_ids):,}")
	    print(f"Total tokens: {len(train_ids) + len(test_ids):,}")

	    # Show some example characters
	    print("\nFirst 100 characters in vocabulary:")
	    print(''.join(chars[:100]))

	    return data_dir

	# Run the full download-and-prepare pipeline only when this file is executed
	# as a script, not when it is imported as a module.
	if __name__ == "__main__":
	    download_and_prepare_code_dataset()