import multiprocessing
from itertools import chain

from datasets import load_dataset
from huggingface_hub import HfApi
from transformers import AutoTokenizer


class DatasetBuilder:
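    """Stream a Hugging Face dataset, tokenize it, and pack the tokens into
    fixed-length blocks suitable for language-model pretraining."""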

    def __init__(
        self,
        dataset_name,
        seq_len=8192,
        num_cpu=None,
        hf_account_repo=None,
        tokenizer="EleutherAI/gpt-neox-20b",
    ):
        self.dataset_name = dataset_name
        self.seq_len = seq_len
        # Unused in the streaming path below (IterableDataset.map takes no
        # num_proc argument); kept for non-streaming workflows.
        self.num_cpu = num_cpu or multiprocessing.cpu_count()
        self.hf_account_repo = hf_account_repo
        self.tokenizer = tokenizer

    def build_dataset(self):
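        """Tokenize the streamed corpus, pack it into `seq_len`-token blocks,
        and optionally push the result to the Hugging Face Hub."""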
        tokenizer = AutoTokenizer.from_pretrained(self.tokenizer)
        train_dataset = load_dataset(self.dataset_name, split="train", streaming=True)
        dataset = train_dataset.shuffle()

        def tokenize_function(example):
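            # Append the EOS token to every document so document boundaries
            # survive the packing step below.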
            return tokenizer([t + tokenizer.eos_token for t in example["text"]])

        tokenized_dataset = dataset.map(
            tokenize_function,
            batched=True,
            remove_columns=["text"],
        )

        block_size = self.seq_len

        def group_texts(examples):
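            # Concatenate every tokenized sequence in the batch, then re-split
            # the token stream into contiguous blocks of `block_size` tokens.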
            concatenated_examples = {
                k: list(chain(*examples[k])) for k in examples.keys()
            }
            total_length = len(concatenated_examples[list(examples.keys())[0]])
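            # Drop the trailing partial block so every packed example has
            # exactly block_size tokens.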
            if total_length >= block_size:
                total_length = (total_length // block_size) * block_size
            result = {
                k: [
                    t[i : i + block_size]
                    for i in range(0, total_length, block_size)
                ]
                for k, t in concatenated_examples.items()
            }
            return result

        train_tokenized_dataset = tokenized_dataset.map(
            group_texts,
            batched=True,
        )

        if self.hf_account_repo:
            # Create the dataset repo if it does not exist yet, then push the
            # processed dataset. Note: pushing a streaming IterableDataset
            # requires a recent `datasets` release; with older versions, load
            # without streaming=True (or materialize the data) before pushing.
            hf_api = HfApi()
            hf_api.create_repo(
                repo_id=self.hf_account_repo,
                repo_type="dataset",
                exist_ok=True,
            )
            train_tokenized_dataset.push_to_hub(self.hf_account_repo)

        return train_tokenized_dataset


builder = DatasetBuilder(
    dataset_name="the_pile_books3",
    seq_len=8192,
    hf_account_repo="kye/thepilebooks3-gptneox-8k",
    tokenizer="EleutherAI/gpt-neox-20b",
)

dataset = builder.build_dataset()
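
# Quick sanity check (illustrative addition, not part of the original script):
# each packed example should contain exactly seq_len token ids.
first_example = next(iter(dataset))
assert len(first_example["input_ids"]) == 8192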