Crest-20M-Base / prepare.py
LH-Tech-AI's picture
Create prepare.py
c496de2 verified
import os
import tiktoken
import numpy as np
from datasets import load_dataset
from tqdm import tqdm
# Hugging Face Hub dataset id that the script's main section streams from.
DATASET_NAME = "HuggingFaceFW/fineweb-edu"
# Dataset configuration, passed as `name=` to `load_dataset`.
SAMPLE_NAME = "sample-10BT"
# Collection stops after the first document that brings the running token
# count to at least this value, so the final count may slightly exceed it.
TARGET_TOKENS = 100_000_000
# NOTE(review): not referenced anywhere in the visible code; presumably a
# leftover from a non-streaming, multi-process variant. Confirm before removing.
NUM_PROC = 8
# tiktoken "gpt2" encoding shared by `process`.
enc = tiktoken.get_encoding("gpt2")
def process(example):
    """Tokenize one dataset record and terminate it with the end-of-text token.

    Args:
        example: Mapping with a ``'text'`` entry holding the document string.

    Returns:
        A dict with ``'ids'`` (the token ids followed by ``enc.eot_token``)
        and ``'len'`` (the number of ids, including the terminator).
    """
    text = example['text']
    # encode_ordinary treats special-token strings in the text as plain text;
    # the end-of-text id is appended explicitly as the document separator.
    token_ids = [*enc.encode_ordinary(text), enc.eot_token]
    return dict(ids=token_ids, len=len(token_ids))
if __name__ == "__main__":
    # Stream the corpus, tokenize documents until TARGET_TOKENS is reached,
    # then write a 95% / 5% train/val split as raw uint16 token files.
    print(f"Loading streaming dataset {DATASET_NAME}...")
    dataset = load_dataset(DATASET_NAME, name=SAMPLE_NAME, split='train', streaming=True)

    # Store each document as a compact uint16 array right away. Accumulating
    # ~100M Python ints in a single list costs far more memory (an object
    # reference per token plus the int objects) than 2 bytes per token.
    chunks = []
    total_tokens = 0

    # The context manager closes the progress bar even if streaming raises.
    with tqdm(total=TARGET_TOKENS, desc="Collecting tokens") as pbar:
        for example in dataset:
            tokens = process(example)['ids']
            chunks.append(np.array(tokens, dtype=np.uint16))
            total_tokens += len(tokens)
            pbar.update(len(tokens))
            # Documents are kept whole, so the total may overshoot the target.
            if total_tokens >= TARGET_TOKENS:
                break

    # np.concatenate rejects an empty sequence; fall back to an empty array so
    # an empty dataset still produces (empty) output files.
    if chunks:
        all_tokens = np.concatenate(chunks)
    else:
        all_tokens = np.empty(0, dtype=np.uint16)

    # First 95% of the token stream is training data, the remainder validation.
    n = len(all_tokens)
    split_at = int(n * 0.95)
    train_data = all_tokens[:split_at]
    val_data = all_tokens[split_at:]

    for name, d in [('train', train_data), ('val', val_data)]:
        filename = f"{name}.bin"
        # Raw dump of the uint16 ids with no header.
        d.tofile(filename)
        print(f"Saved {filename} with {len(d):,} tokens.")

    print("\nDone!")