import os

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer
from tqdm import tqdm

DATA_DIR = "data/wikitext-103"


def prepare():
    os.makedirs(DATA_DIR, exist_ok=True)

    print("📥 Loading wikitext-103-v1 from HuggingFace datasets...")
    dataset = load_dataset("wikitext", "wikitext-103-v1")

    print("📥 Loading GPT-2 tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained("gpt2")

    for split in ['train', 'validation', 'test']:
        output_file = os.path.join(DATA_DIR, f"{split}.bin")
        if os.path.exists(output_file):
            print(f"⏩ {output_file} already exists.")
            continue

        print(f"⚗️ Tokenizing {split} split...")
        all_ids = []
        # Tokenize line by line so tqdm can report progress.
        for text in tqdm(dataset[split]['text'], desc=split):
            if not text.strip():
                continue
            # Append eos_token_id after each line to mark clean boundaries;
            # the GPT-2 tokenizer does not insert it on its own.
            ids = tokenizer.encode(text) + [tokenizer.eos_token_id]
            all_ids.extend(ids)

        arr = np.array(all_ids, dtype=np.uint32)
        arr.tofile(output_file)
        print(f"✅ {split}.bin created: {len(arr)} tokens.")


if __name__ == '__main__':
    prepare()
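
# The .bin files written above are flat arrays of uint32 token ids, so they can
# be read back lazily without loading everything into RAM, e.g. via np.memmap.
# A minimal sketch (the split name and slice size below are illustrative, not
# part of this script):
#
#   data = np.memmap(os.path.join(DATA_DIR, "train.bin"), dtype=np.uint32, mode="r")
#   print(len(data), "tokens")
#   block = data[:1024]  # first 1024 token ids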