import os

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm

DATA_DIR = "data/wikitext-103"


def prepare():
    os.makedirs(DATA_DIR, exist_ok=True)

    print("📥 Loading wikitext-103-v1 from HuggingFace datasets...")
    dataset = load_dataset("wikitext", "wikitext-103-v1")

    print("📥 Loading GPT-2 tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    for split in ['train', 'validation', 'test']:
        output_file = os.path.join(DATA_DIR, f"{split}.bin")
        if os.path.exists(output_file):
            print(f"⏩ {output_file} already exists.")
            continue

        print(f"⚗️ Tokenizing {split} split...")
        all_ids = []
        # Tokenize line by line so tqdm can report progress.
        for text in tqdm(dataset[split]['text'], desc=split):
            if not text.strip():
                continue
            # Append eos_token_id after each line to mark clean boundaries;
            # the GPT-2 tokenizer does not insert it on its own.
            ids = tokenizer.encode(text) + [tokenizer.eos_token_id]
            all_ids.extend(ids)

        arr = np.array(all_ids, dtype=np.uint32)
        arr.tofile(output_file)
        print(f"✅ {split}.bin created: {len(arr)} tokens.")


if __name__ == '__main__':
    prepare()
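
# The .bin files written above are flat arrays of uint32 token ids, so they can
# be read back lazily without loading everything into RAM, e.g. via np.memmap.
# A minimal sketch (the split name and slice size below are illustrative, not
# part of this script):
#
#   data = np.memmap(os.path.join(DATA_DIR, "train.bin"), dtype=np.uint32, mode="r")
#   print(len(data), "tokens")
#   block = data[:1024]  # first 1024 token ids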