# babylm_dataset.py
# author: Julie Kallini
import datasets
import os
import glob
import tqdm
from numpy.random import default_rng
from itertools import product
logger = datasets.logging.get_logger(__name__)
_DESCRIPTION = """\
Pre-tokenized BabyLM HuggingFace dataset for verb perturbations.
"""
_PERTURBED_DATA_PATH = "../data/Qwen_perturbed_data/Qwen2.5-7B"
_PERTURBATIONS = ["hop_control", "hop_tokens4", "hop_words4",
"reverse_control", "reverse_partial", "reverse_full",
"shuffle_control", "shuffle_nondeterministic",
"shuffle_deterministic21", "shuffle_deterministic57", "shuffle_deterministic84",
"shuffle_local3", "shuffle_local5", "shuffle_local10",
"shuffle_even_odd"]
# _RANDOM_SEEDS = [0, 14, 41, 53, 96]
_RANDOM_SEEDS = [0]
# _TRAIN_SETS = ["100M", "10M"]
_TRAIN_SETS = ["10M"]
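# Token ID appended between sentences when concatenating
# (50256 is the GPT-2 tokenizer's <|endoftext|> ID)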
_EOS_TOKEN_ID = 50256
class BabyConfig(datasets.BuilderConfig):
def __init__(self, data_dir, babylm_train_set, random_seed, **kwargs):
"""BuilderConfig for IzParens
Args:
data_dir: path to directory of tokenized, perturbed BabyLM dataset
"""
super(BabyConfig, self).__init__(
**kwargs,
)
self.data_dir = data_dir
self.babylm_train_set = babylm_train_set
self.random_seed = random_seed
class BabyLMCorpus(datasets.GeneratorBasedBuilder):
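    """Dataset builder for the pre-tokenized, perturbed BabyLM corpora.
    One config is generated per (perturbation, train set, random seed) combination;
    each example is a space-separated string of token IDs chunked to length 1024.
    """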
BUILDER_CONFIGS = [
BabyConfig(
name=f"babylm_{perturbation}_{train_set}_seed{random_seed}",
data_dir=os.path.join(
_PERTURBED_DATA_PATH, "babylm_" + perturbation),
babylm_train_set=train_set,
random_seed=random_seed,
) for perturbation, train_set, random_seed in list(product(_PERTURBATIONS, _TRAIN_SETS, _RANDOM_SEEDS))
]
def _info(self):
return datasets.DatasetInfo(
# This is the description that will appear on the datasets page.
description=_DESCRIPTION,
# datasets.features.FeatureConnectors
            features=datasets.Features(
                {
                    # Each example is a single string of space-separated token IDs.
                    "text": datasets.Value("string")
                }
            ),
# If there's a common (input, target) tuple from the features,
# specify them here. They'll be used if as_supervised=True in
# builder.as_dataset.
supervised_keys=None,
)
def _split_generators(self, dl_manager):
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": os.path.join(
                        self.config.data_dir, "babylm_" + self.config.babylm_train_set),
                    "random_seed": self.config.random_seed,
                    "split": "train",
                },
),
            # datasets.SplitGenerator(
            #     name=datasets.Split.VALIDATION,
            #     gen_kwargs={
            #         "data_dir": os.path.join(self.config.data_dir, "babylm_dev"),
            #         "random_seed": self.config.random_seed,
            #         "split": "valid",
            #     },
            # ),
]
def __chunk(self, sentences, eos_token):
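        """Concatenate pre-tokenized sentences with `eos_token` and split the
        resulting token stream into fixed-length chunks of 1024 tokens."""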
        # Parse each pre-tokenized sentence into a list of token IDs
logger.info("Loading pre-tokenized data")
tokenized_sentences = []
for sent in tqdm.tqdm(sentences):
tokenized_sentences.append([int(tok) for tok in sent.split()])
# Concatenate the tokenized sentences using the EOS token
logger.info("Concatenating tokenized data using EOS token")
all_tokens = []
for tokens in tqdm.tqdm(tokenized_sentences):
all_tokens.extend(tokens)
all_tokens.append(eos_token)
# Chunk the tokens into sublists of max_seq_len tokens each
logger.info("Chunking tokens into sublists of 1024")
max_seq_len = 1024
chunked_tokens = []
for i in tqdm.tqdm(range(0, len(all_tokens), max_seq_len)):
chunked_tokens.append(all_tokens[i:i + max_seq_len])
        # Drop the last chunk if it is shorter than max_seq_len
if len(chunked_tokens[-1]) < max_seq_len:
chunked_tokens.pop()
return chunked_tokens
def _generate_examples(self, data_dir, random_seed, split):
"""This function returns the BabyLM text in the discretized, tokenized form."""
logger.info("Generating examples from = %s", data_dir)
infiles = sorted(glob.glob(os.path.join(data_dir, "*")))
# Extend sentences
all_sentences = []
        for infile in infiles:
            with open(infile, encoding="utf-8") as f:
                all_sentences.extend(f.readlines())
logger.info("Total sentences: {}".format(len(all_sentences)))
        # Shuffle sentences before chunking, since the chunked examples are fixed at build time
rng = default_rng(seed=random_seed)
rng.shuffle(all_sentences)
# Tokenize and chunk
tokenized_lines = self.__chunk(all_sentences, _EOS_TOKEN_ID)
# Generate data
logger.info("Writing dataset as space-separated sequences of tokens")
        for idx, line in enumerate(tokenized_lines):
            text = " ".join([str(tok) for tok in line]) + "\n"
            yield idx, {"text": text}
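
# A minimal usage sketch (an assumption, not part of the original script): this
# file can be consumed as a local `datasets` loading script. The config name
# below is one of the combinations enumerated in BUILDER_CONFIGS; recent
# `datasets` versions may additionally require `trust_remote_code=True`.
#
#     import datasets
#     dataset = datasets.load_dataset(
#         "babylm_dataset.py",
#         name="babylm_hop_control_10M_seed0",
#         split="train",
#     )
#     print(dataset[0]["text"][:80])  # space-separated token IDs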