LH-Tech-AI committed on
Commit
c496de2
·
verified ·
1 Parent(s): d6dfa66

Create prepare.py

Browse files
Files changed (1) hide show
  1. prepare.py +46 -0
prepare.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tiktoken
3
+ import numpy as np
4
+ from datasets import load_dataset
5
+ from tqdm import tqdm
6
+
7
# Dataset identifier passed as the first argument to `load_dataset`.
DATASET_NAME = "HuggingFaceFW/fineweb-edu"
# Dataset configuration passed to `load_dataset` via `name=`.
SAMPLE_NAME = "sample-10BT"
# Collection stops once at least this many tokens have been gathered; it is
# also used as the progress-bar total.
TARGET_TOKENS = 100_000_000
# NOTE(review): not referenced anywhere in the visible script -- confirm
# whether it is still needed.
NUM_PROC = 8

# Shared tokenizer instance used by `process` below.
enc = tiktoken.get_encoding("gpt2")
13
+
14
def process(example):
    """Tokenize one dataset record with the module-level encoder ``enc``.

    Args:
        example: Mapping that holds the raw document under the ``'text'`` key.

    Returns:
        A dict with ``'ids'`` (the encoded token ids followed by
        ``enc.eot_token`` as a document terminator) and ``'len'`` (the number
        of ids, terminator included).
    """
    # Build the terminated sequence in one expression instead of appending.
    token_ids = [*enc.encode_ordinary(example['text']), enc.eot_token]
    return {'ids': token_ids, 'len': len(token_ids)}
18
+
19
if __name__ == "__main__":
    # Stream documents from the dataset, tokenize them until at least
    # TARGET_TOKENS ids have been collected, then write a 95% / 5% split to
    # train.bin / val.bin as raw uint16 token ids.

    # The output files store ids as uint16. Refuse to continue if the encoder
    # can emit an id that does not fit, rather than risk writing wrapped or
    # truncated values.
    if enc.max_token_value > np.iinfo(np.uint16).max:
        raise ValueError(
            f"Encoder token ids up to {enc.max_token_value} do not fit in uint16"
        )

    print(f"Loading streaming dataset {DATASET_NAME}...")
    dataset = load_dataset(DATASET_NAME, name=SAMPLE_NAME, split='train', streaming=True)

    # Keep each document as a compact uint16 array (2 bytes per token) instead
    # of growing one Python list of ~TARGET_TOKENS int objects.
    chunks = []
    total_tokens = 0

    # The context manager closes the bar even if streaming or tokenizing raises.
    with tqdm(total=TARGET_TOKENS, desc="Collecting tokens") as pbar:
        for example in dataset:
            ids = np.asarray(process(example)['ids'], dtype=np.uint16)
            chunks.append(ids)
            total_tokens += len(ids)
            pbar.update(len(ids))
            # The last document is kept whole, so the final count may exceed
            # TARGET_TOKENS slightly.
            if total_tokens >= TARGET_TOKENS:
                break

    # np.concatenate rejects an empty list, so fall back to an empty array
    # when the stream yielded no documents.
    all_tokens = np.concatenate(chunks) if chunks else np.empty(0, dtype=np.uint16)
    del chunks  # release the per-document arrays before writing

    n = len(all_tokens)
    split = int(n * 0.95)
    # Slices of an ndarray are views, so the split costs no extra copy.
    train_data = all_tokens[:split]
    val_data = all_tokens[split:]

    for name, d in [('train', train_data), ('val', val_data)]:
        filename = f"{name}.bin"
        d.tofile(filename)
        print(f"Saved {filename} with {len(d):,} tokens.")

    print("\nDone!")