jrahn committed
Commit 7f55685
1 Parent(s): f227cda

Upload edu_fineweb_hermes.py with huggingface_hub

Files changed (1):
  1. edu_fineweb_hermes.py +168 -0
edu_fineweb_hermes.py ADDED
@@ -0,0 +1,168 @@
"""
FineWeb dataset (for srs pretraining), interleaved with the OpenHermes-2.5 chat dataset
https://huggingface.co/datasets/HuggingFaceFW/fineweb

example doc to highlight the structure of the dataset:
{
  "text": "Posted by mattsmith on 20th April 2012\nStraight from...",
  "id": "<urn:uuid:d853d453-196e-4488-a411-efc2b26c40d2>",
  "dump": "CC-MAIN-2013-20",
  "url": "http://nleastchatter.com/philliesphandom/tag/freddy-galvis/",
  "date": "2013-05-18T07:24:47Z",
  "file_path": "s3://commoncrawl/long.../path.../file.gz",
  "language": "en",
  "language_score": 0.9185474514961243,
  "token_count": 594
}

Example of downloading the 100B dataset of FineWebEDU, from the root directory:
python dev/data/fineweb.py -t edu -v 100B
The 100B version runs for a few hours, depending on your internet connection and computer.
"""
import os
import argparse
import multiprocessing as mp
import numpy as np
import tiktoken
from datasets import load_dataset, interleave_datasets, concatenate_datasets
from tqdm import tqdm

from data_common import write_datafile
# ------------------------------------------

from transformers import AutoTokenizer

# GPT-2 tokenizer from transformers, used only to render ShareGPT conversations as ChatML text
tkc = AutoTokenizer.from_pretrained("gpt2")
tkc.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

def sharegpt_to_chatml(example):
    chatml_conversations = []
    for conv in example["conversations"]:
        if conv["from"] == "human":
            role = "user"
        elif conv["from"] == "system":
            role = "system"
        elif conv["from"] == "gpt":
            role = "assistant"
        else:
            role = "user"
        chatml_format = {"role": role, "content": conv["value"]}
        chatml_conversations.append(chatml_format)
    formatted = tkc.apply_chat_template(chatml_conversations, tokenize=False, add_generation_prompt=False)
    return {"text": formatted}
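# Illustrative sketch (not part of the original file): given an OpenHermes-2.5 record such as
#   {"conversations": [{"from": "human", "value": "Hi!"}, {"from": "gpt", "value": "Hello."}]}
# the chat template above renders it to a single ChatML string roughly like
#   "<|im_start|>user\nHi!<|im_end|>\n<|im_start|>assistant\nHello.<|im_end|>\n"
# which is what ends up in the "text" column used for tokenization below.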
# ------------------------------------------

parser = argparse.ArgumentParser(description="FineWeb and Edu-FineWeb dataset preprocessing")
parser.add_argument("-t", "--type", type=str, default="edu", help="Fineweb type, edu|classic")
parser.add_argument("-v", "--version", type=str, default="10B", help="Fineweb data sample size, 10B|100B")
parser.add_argument("-s", "--shard_size", type=int, default=10**8, help="Size of each data shard in the output .bin files, in tokens")
args = parser.parse_args()

# FineWeb has a few possible subsamples available
assert args.version in {"10B", "100B"}, "version must be one of: 10B, 100B"
assert args.type in {"edu", "classic"}, "type must be one of: edu, classic"
directories = {
    ("classic", "10B"): ("fineweb10B_hermes", "sample-10BT"),
    ("classic", "100B"): ("fineweb100B_hermes", "sample-100BT"),
    ("edu", "10B"): ("edu_fineweb10B_hermes", "sample-10BT"),
    ("edu", "100B"): ("edu_fineweb100B_hermes", "sample-100BT")
}
local_dir, remote_name = directories[(args.type, args.version)]
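# For example, with the defaults (-t edu -v 10B) this selects the local output directory
# "edu_fineweb10B_hermes" and the remote subset "sample-10BT".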

# create the local cache directory if it doesn't exist yet
DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), local_dir)
os.makedirs(DATA_CACHE_DIR, exist_ok=True)

# download the dataset
if args.type == "classic":
    fw = load_dataset("HuggingFaceFW/fineweb", name=remote_name, split="train")
    name = "fineweb_hermes"
elif args.type == "edu":
    fw = load_dataset("HuggingFaceFW/fineweb-edu", name=remote_name, split="train")
    name = "edu_fineweb_hermes"

# load OpenHermes-2.5, render the ShareGPT conversations to ChatML text, keep only the "text" column
oh = load_dataset("teknium/OpenHermes-2.5", split="train")
oh = oh.map(sharegpt_to_chatml)
oh = oh.select_columns(["text"])

# split both sources into three consecutive parts; the chat data gets a larger share toward the end
fw_1 = fw.select(range(len(fw)//2))
fw_2 = fw.select(range(len(fw)//2, len(fw)-500_000))
fw_3 = fw.select(range(len(fw)-500_000, len(fw)))
oh_1 = oh.select(range(100_000))
oh_2 = oh.select(range(100_000, 500_000))
oh_3 = oh.select(range(500_000, len(oh)))

p1 = len(fw_1) / (len(fw_1) + len(oh_1))
p2 = len(fw_2) / (len(fw_2) + len(oh_2))
p3 = len(fw_3) / (len(fw_3) + len(oh_3))

# interleave the two sources within each part, then concatenate the three parts in order
ds = concatenate_datasets([
    interleave_datasets([fw_1, oh_1], probabilities=[p1, 1-p1]),
    interleave_datasets([fw_2, oh_2], probabilities=[p2, 1-p2]),
    interleave_datasets([fw_3, oh_3], probabilities=[p3, 1-p3])
])
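# Note (library behavior, not stated in the original file): interleave_datasets samples from the
# two sources with the given probabilities and, by default, stops once one source is exhausted
# (stopping_strategy="first_exhausted"), so each interleaved part can end up slightly smaller
# than len(fw_i) + len(oh_i); this is why the printed total below is a bit below the sum of the
# three parts.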
print("Dataset proportions:")
print(f"Part 1: FWE {len(fw_1):,} + OH {len(oh_1):,} ({1-p1:.2%}) = {len(fw_1) + len(oh_1):,}")
print(f"Part 2: FWE {len(fw_2):,} + OH {len(oh_2):,} ({1-p2:.2%}) = {len(fw_2) + len(oh_2):,}")
print(f"Part 3: FWE {len(fw_3):,} + OH {len(oh_3):,} ({1-p3:.2%}) = {len(fw_3) + len(oh_3):,}")
print(f"Total documents: {len(ds):,}")

# example output from one run:
# Dataset proportions:
# Part 1: FWE 4,836,050 + OH 100,000 (2.03%) = 4,936,050
# Part 2: FWE 4,336,051 + OH 400,000 (8.45%) = 4,736,051
# Part 3: FWE 500,000 + OH 501,551 (50.08%) = 1,001,551
# Total documents: 10,669,024

# init the tokenizer
enc = tiktoken.get_encoding("gpt2")
eot = enc._special_tokens['<|endoftext|>'] # end of text token
def tokenize(doc):
    # tokenizes a single document and returns a numpy array of uint16 tokens
    tokens = [eot] # the special <|endoftext|> token delimits all documents
    tokens.extend(enc.encode_ordinary(doc["text"]))
    tokens_np = np.array(tokens)
    assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
    tokens_np_uint16 = tokens_np.astype(np.uint16)
    return tokens_np_uint16
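# Illustrative sketch (not part of the original file): tokenize({"text": "Hello world"}) returns a
# uint16 array whose first element is the <|endoftext|> id (50256 in the GPT-2 vocabulary),
# followed by the BPE ids for "Hello world"; every document in the shards is delimited this way.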

# tokenize all documents and write output shards, each of shard_size tokens (last shard has remainder)
nprocs = max(1, os.cpu_count() - 2) # don't hog the entire system
with mp.Pool(nprocs) as pool:
    shard_index = 0
    # preallocate buffer to hold current shard
    all_tokens_np = np.empty((args.shard_size,), dtype=np.uint16)
    token_count = 0
    progress_bar = None
    for tokens in pool.imap(tokenize, ds, chunksize=16):

        # is there enough space in the current shard for the new tokens?
        if token_count + len(tokens) < args.shard_size:
            # simply append tokens to current shard
            all_tokens_np[token_count:token_count+len(tokens)] = tokens
            token_count += len(tokens)
            # update progress bar
            if progress_bar is None:
                progress_bar = tqdm(total=args.shard_size, unit="tokens", desc=f"Shard {shard_index}")
            progress_bar.update(len(tokens))
        else:
            # write the current shard and start a new one
            split = "val" if shard_index == 0 else "train"
            filename = os.path.join(DATA_CACHE_DIR, f"{name}_{split}_{shard_index:06d}.bin")
            # split the document into whatever fits in this shard; the remainder goes to next one
            remainder = args.shard_size - token_count
            progress_bar.update(remainder)
            all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
            write_datafile(filename, all_tokens_np)
            shard_index += 1
            progress_bar = None
            # populate the next shard with the leftovers of the current doc
            all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
            token_count = len(tokens)-remainder

    # write any remaining tokens as the last shard
    if token_count != 0:
        split = "val" if shard_index == 0 else "train"
        filename = os.path.join(DATA_CACHE_DIR, f"{name}_{split}_{shard_index:06d}.bin")
        write_datafile(filename, all_tokens_np[:token_count])
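# Usage sketch (assuming data_common.write_datafile is importable, as in llm.c's dev/data utilities):
#   python edu_fineweb_hermes.py -t edu -v 10B
# writes shards named like edu_fineweb_hermes_val_000000.bin (shard 0 is the val split) and
# edu_fineweb_hermes_train_000001.bin, ... into the edu_fineweb10B_hermes/ directory next to this
# script, each holding shard_size (default 10**8) uint16 GPT-2 token ids.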