Upload edu_fineweb_hermes.py with huggingface_hub
edu_fineweb_hermes.py
ADDED  +168 -0
@@ -0,0 +1,168 @@
+"""
+FineWeb dataset (for srs pretraining)
+https://huggingface.co/datasets/HuggingFaceFW/fineweb
+
+example doc to highlight the structure of the dataset:
+{
+  "text": "Posted by mattsmith on 20th April 2012\nStraight from...",
+  "id": "<urn:uuid:d853d453-196e-4488-a411-efc2b26c40d2>",
+  "dump": "CC-MAIN-2013-20",
+  "url": "http://nleastchatter.com/philliesphandom/tag/freddy-galvis/",
+  "date": "2013-05-18T07:24:47Z",
+  "file_path": "s3://commoncrawl/long.../path.../file.gz",
+  "language": "en",
+  "language_score": 0.9185474514961243,
+  "token_count": 594
+}
+
+Example of downloading the 100B sample of FineWeb-Edu, from the root directory:
+python edu_fineweb_hermes.py -t edu -v 100B
+The 100B version runs for a few hours, depending on your internet connection and machine.
+"""
+import os
+import argparse
+import multiprocessing as mp
+import numpy as np
+import tiktoken
+from datasets import load_dataset, interleave_datasets, concatenate_datasets
+from tqdm import tqdm
+
+from data_common import write_datafile
+# ------------------------------------------
+
+from transformers import AutoTokenizer
+
+tkc = AutoTokenizer.from_pretrained("gpt2")
+tkc.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
+
+def sharegpt_to_chatml(example):
+    chatml_conversations = []
+    for conv in example["conversations"]:
+        if conv["from"] == "human":
+            role = "user"
+        elif conv["from"] == "system":
+            role = "system"
+        elif conv["from"] == "gpt":
+            role = "assistant"
+        else:
+            role = "user"
+        chatml_format = {"role": role, "content": conv["value"]}
+        chatml_conversations.append(chatml_format)
+    formatted = tkc.apply_chat_template(chatml_conversations, tokenize=False, add_generation_prompt=False)
+    return {"text": formatted}
+# ------------------------------------------
+
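For reference, the chat template assigned above renders each turn as <|im_start|>{role}\n{content}<|im_end|>\n, so converting a small ShareGPT-style record should produce text along these lines (illustrative values, not taken from the dataset):

example = {"conversations": [
    {"from": "human", "value": "hi"},
    {"from": "gpt", "value": "hello"},
]}
sharegpt_to_chatml(example)
# -> {"text": "<|im_start|>user\nhi<|im_end|>\n<|im_start|>assistant\nhello<|im_end|>\n"}

Note that <|im_start|> and <|im_end|> are plain text to the GPT-2 BPE used further down; encode_ordinary splits them into ordinary byte-pair tokens rather than single special tokens.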
+parser = argparse.ArgumentParser(description="FineWeb and Edu-FineWeb dataset preprocessing")
+parser.add_argument("-t", "--type", type=str, default="edu", help="Fineweb type, edu|classic")
+parser.add_argument("-v", "--version", type=str, default="10B", help="Fineweb data sample size, 10B|100B")
+parser.add_argument("-s", "--shard_size", type=int, default=10**8, help="Size of each data shard in the output .bin files, in tokens")
+args = parser.parse_args()
+
+# FineWeb has a few possible subsamples available
+assert args.version in {"10B", "100B"}, "version must be one of: 10B, 100B"
+assert args.type in {"edu", "classic"}, "type must be one of: edu, classic"
+directories = {
+    ("classic", "10B"): ("fineweb10B_hermes", "sample-10BT"),
+    ("classic", "100B"): ("fineweb100B_hermes", "sample-100BT"),
+    ("edu", "10B"): ("edu_fineweb10B_hermes", "sample-10BT"),
+    ("edu", "100B"): ("edu_fineweb100B_hermes", "sample-100BT")
+}
+local_dir, remote_name = directories[(args.type, args.version)]
+
+# create the local cache directory if it doesn't exist yet
+DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), local_dir)
+os.makedirs(DATA_CACHE_DIR, exist_ok=True)
+
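For instance, running python edu_fineweb_hermes.py -t classic -v 100B would select the sample-100BT subset of HuggingFaceFW/fineweb and write shards into fineweb100B_hermes/ next to the script, while the defaults (-t edu -v 10B) resolve as sketched here:

local_dir, remote_name = directories[("edu", "10B")]
# local_dir      == "edu_fineweb10B_hermes"
# remote_name    == "sample-10BT"
# DATA_CACHE_DIR == os.path.join(os.path.dirname(__file__), "edu_fineweb10B_hermes")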
+# download the dataset
+if args.type == "classic":
+    fw = load_dataset("HuggingFaceFW/fineweb", name=remote_name, split="train")
+    name = "fineweb_hermes"
+elif args.type == "edu":
+    fw = load_dataset("HuggingFaceFW/fineweb-edu", name=remote_name, split="train")
+    name = "edu_fineweb_hermes"
+
+oh = load_dataset("teknium/OpenHermes-2.5", split="train")
+oh = oh.map(sharegpt_to_chatml)
+oh = oh.select_columns(["text"])
+
+fw_1 = fw.select(range(len(fw)//2))
+fw_2 = fw.select(range(len(fw)//2, len(fw)-500_000))
+fw_3 = fw.select(range(len(fw)-500_000, len(fw)))
+oh_1 = oh.select(range(100_000))
+oh_2 = oh.select(range(100_000, 500_000))
+oh_3 = oh.select(range(500_000, len(oh)))
+
+p1 = len(fw_1) / (len(fw_1) + len(oh_1))
+p2 = len(fw_2) / (len(fw_2) + len(oh_2))
+p3 = len(fw_3) / (len(fw_3) + len(oh_3))
+
+# interleaving the two datasets
+ds = concatenate_datasets([
+    interleave_datasets([fw_1, oh_1], probabilities=[p1, 1-p1]),
+    interleave_datasets([fw_2, oh_2], probabilities=[p2, 1-p2]),
+    interleave_datasets([fw_3, oh_3], probabilities=[p3, 1-p3])
+])
+print("Dataset proportions:")
+print(f"Part 1: FWE {len(fw_1):,} + OH {len(oh_1):,} ({1-p1:.2%}) = {len(fw_1) + len(oh_1):,}")
+print(f"Part 2: FWE {len(fw_2):,} + OH {len(oh_2):,} ({1-p2:.2%}) = {len(fw_2) + len(oh_2):,}")
+print(f"Part 3: FWE {len(fw_3):,} + OH {len(oh_3):,} ({1-p3:.2%}) = {len(fw_3) + len(oh_3):,}")
+print(f"Total documents: {len(ds):,}")
+
+# Dataset proportions:
+# Part 1: FWE 4,836,050 + OH 100,000 (2.03%) = 4,936,050
+# Part 2: FWE 4,336,051 + OH 400,000 (8.45%) = 4,736,051
+# Part 3: FWE 500,000 + OH 501,551 (50.08%) = 1,001,551
+# Total documents: 10,669,024
+
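As a quick sanity check on the logged proportions above: the percentage printed for each part is the OpenHermes share, i.e. 1 - p_i. For part 3, 500,000 FineWeb-Edu documents are mixed with 501,551 OpenHermes documents:

p3 = 500_000 / (500_000 + 501_551)   # ~0.4992
print(f"{1 - p3:.2%}")               # 50.08%, matching the log

So the instruction-style data is deliberately back-loaded: roughly 2% of documents in part 1, about 8% in part 2, and about half in the final ~1M documents.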
+# init the tokenizer
+enc = tiktoken.get_encoding("gpt2")
+eot = enc._special_tokens['<|endoftext|>'] # end of text token
+def tokenize(doc):
+    # tokenizes a single document and returns a numpy array of uint16 tokens
+    tokens = [eot] # the special <|endoftext|> token delimits all documents
+    tokens.extend(enc.encode_ordinary(doc["text"]))
+    tokens_np = np.array(tokens)
+    assert (0 <= tokens_np).all() and (tokens_np < 2**16).all(), "token dictionary too large for uint16"
+    tokens_np_uint16 = tokens_np.astype(np.uint16)
+    return tokens_np_uint16
+
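A minimal illustration of what tokenize returns; the ids shown are the usual GPT-2 BPE ids and are given for orientation only:

tokenize({"text": "hello world"})
# -> array([50256, 31373,   995], dtype=uint16)
#    50256 is <|endoftext|>, followed by the BPE tokens for "hello" and " world"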
+# tokenize all documents and write output shards, each of shard_size tokens (last shard has remainder)
+nprocs = max(1, os.cpu_count() - 2) # don't hog the entire system
+with mp.Pool(nprocs) as pool:
+    shard_index = 0
+    # preallocate buffer to hold current shard
+    all_tokens_np = np.empty((args.shard_size,), dtype=np.uint16)
+    token_count = 0
+    progress_bar = None
+    for tokens in pool.imap(tokenize, ds, chunksize=16):
+
+        # is there enough space in the current shard for the new tokens?
+        if token_count + len(tokens) < args.shard_size:
+            # simply append tokens to current shard
+            all_tokens_np[token_count:token_count+len(tokens)] = tokens
+            token_count += len(tokens)
+            # update progress bar
+            if progress_bar is None:
+                progress_bar = tqdm(total=args.shard_size, unit="tokens", desc=f"Shard {shard_index}")
+            progress_bar.update(len(tokens))
+        else:
+            # write the current shard and start a new one
+            split = "val" if shard_index == 0 else "train"
+            filename = os.path.join(DATA_CACHE_DIR, f"{name}_{split}_{shard_index:06d}.bin")
+            # split the document into whatever fits in this shard; the remainder goes to next one
+            remainder = args.shard_size - token_count
+            progress_bar.update(remainder)
+            all_tokens_np[token_count:token_count+remainder] = tokens[:remainder]
+            write_datafile(filename, all_tokens_np)
+            shard_index += 1
+            progress_bar = None
+            # populate the next shard with the leftovers of the current doc
+            all_tokens_np[0:len(tokens)-remainder] = tokens[remainder:]
+            token_count = len(tokens)-remainder
+
+    # write any remaining tokens as the last shard
+    if token_count != 0:
+        split = "val" if shard_index == 0 else "train"
+        filename = os.path.join(DATA_CACHE_DIR, f"{name}_{split}_{shard_index:06d}.bin")
+        write_datafile(filename, all_tokens_np[:token_count])
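After the run, the shard files can be checked against the naming scheme above (the first shard, index 000000, is the val split and the rest are train). A small sketch, assuming the default edu/10B output directory:

import glob, os
out_dir = "edu_fineweb10B_hermes"  # default output dir for -t edu -v 10B
val_shards = glob.glob(os.path.join(out_dir, "edu_fineweb_hermes_val_*.bin"))
train_shards = glob.glob(os.path.join(out_dir, "edu_fineweb_hermes_train_*.bin"))
print(len(val_shards), len(train_shards))
# files are named like edu_fineweb_hermes_val_000000.bin, edu_fineweb_hermes_train_000001.bin, ...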