import glob
import os
from itertools import product

import datasets
import tqdm
from numpy.random import default_rng

logger = datasets.logging.get_logger(__name__)

_DESCRIPTION = """\
Pre-tokenized BabyLM HuggingFace dataset for verb perturbations.
"""

_PERTURBED_DATA_PATH = "../data/Qwen_perturbed_data/Qwen2.5-7B"
_PERTURBATIONS = [
    "hop_control", "hop_tokens4", "hop_words4",
    "reverse_control", "reverse_partial", "reverse_full",
    "shuffle_control", "shuffle_nondeterministic",
    "shuffle_deterministic21", "shuffle_deterministic57", "shuffle_deterministic84",
    "shuffle_local3", "shuffle_local5", "shuffle_local10",
    "shuffle_even_odd",
]

_RANDOM_SEEDS = [0]
_TRAIN_SETS = ["10M"]

# GPT-2's <|endoftext|> token id, used to delimit sentences when they are
# concatenated into a single token stream before chunking.
_EOS_TOKEN_ID = 50256


class BabyConfig(datasets.BuilderConfig):

    def __init__(self, data_dir, babylm_train_set, random_seed, **kwargs):
        """BuilderConfig for the perturbed BabyLM corpus.

        Args:
            data_dir: path to directory of tokenized, perturbed BabyLM data
            babylm_train_set: name of the BabyLM training set (e.g. "10M")
            random_seed: seed used to shuffle sentences before chunking
        """
        super().__init__(**kwargs)
        self.data_dir = data_dir
        self.babylm_train_set = babylm_train_set
        self.random_seed = random_seed


class BabyLMCorpus(datasets.GeneratorBasedBuilder):
    # One config per (perturbation, train set, seed) combination,
    # named e.g. "babylm_hop_control_10M_seed0".
    BUILDER_CONFIGS = [
        BabyConfig(
            name=f"babylm_{perturbation}_{train_set}_seed{random_seed}",
            data_dir=os.path.join(_PERTURBED_DATA_PATH, "babylm_" + perturbation),
            babylm_train_set=train_set,
            random_seed=random_seed,
        )
        for perturbation, train_set, random_seed in product(
            _PERTURBATIONS, _TRAIN_SETS, _RANDOM_SEEDS
        )
    ]

    def _info(self):
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=datasets.Features(
                {
                    "text": datasets.Value("string"),
                }
            ),
            supervised_keys=None,
        )

    def _split_generators(self, dl_manager):
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                gen_kwargs={
                    "data_dir": os.path.join(
                        self.config.data_dir,
                        "babylm_" + self.config.babylm_train_set,
                    ),
                    "random_seed": self.config.random_seed,
                    "split": "train",
                },
            ),
        ]

    def __chunk(self, sentences, eos_token):
        """Concatenates tokenized sentences with EOS and chunks into blocks of 1024."""
        logger.info("Loading pre-tokenized data")
        tokenized_sentences = []
        for sent in tqdm.tqdm(sentences):
            tokenized_sentences.append([int(tok) for tok in sent.split()])

        logger.info("Concatenating tokenized data using EOS token")
        all_tokens = []
        for tokens in tqdm.tqdm(tokenized_sentences):
            all_tokens.extend(tokens)
            all_tokens.append(eos_token)

        logger.info("Chunking tokens into sublists of 1024")
        max_seq_len = 1024
        chunked_tokens = []
        for i in tqdm.tqdm(range(0, len(all_tokens), max_seq_len)):
            chunked_tokens.append(all_tokens[i:i + max_seq_len])

        # Drop a short final chunk so every example is exactly max_seq_len
        # tokens; guard against the degenerate case of no chunks at all.
        if chunked_tokens and len(chunked_tokens[-1]) < max_seq_len:
            chunked_tokens.pop()

        return chunked_tokens
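
    # Worked example for __chunk: 2,500 concatenated tokens produce
    # floor(2500 / 1024) = 2 full chunks of 1,024 token ids each; the
    # trailing 452-token remainder is discarded by the check above.
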
    def _generate_examples(self, data_dir, random_seed, split):
        """Yields the BabyLM text as fixed-length, space-separated sequences of token ids."""
        logger.info("Generating examples from = %s", data_dir)
        infiles = sorted(glob.glob(os.path.join(data_dir, "*")))

        all_sentences = []
        for infile in infiles:
            with open(infile, encoding="utf-8") as f:
                all_sentences.extend(f.readlines())
        logger.info("Total sentences: {}".format(len(all_sentences)))

        # Shuffle sentences reproducibly before chunking.
        rng = default_rng(seed=random_seed)
        rng.shuffle(all_sentences)

        tokenized_lines = self.__chunk(all_sentences, _EOS_TOKEN_ID)

        logger.info("Writing dataset as space-separated sequences of tokens")
        for idx, line in enumerate(tokenized_lines):
            text = " ".join(str(tok) for tok in line) + "\n"
            yield idx, {"text": text}
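

# Minimal usage sketch, not part of the original script. Assumptions: this file
# sits next to the perturbed data (so _PERTURBED_DATA_PATH resolves), and a
# `datasets` 2.x release that still supports script-based loading is installed
# (`trust_remote_code=True` is required on recent 2.x versions).
if __name__ == "__main__":
    dataset = datasets.load_dataset(
        __file__,
        name="babylm_hop_control_10M_seed0",
        split="train",
        trust_remote_code=True,
    )
    print(dataset[0]["text"][:80])  # space-separated GPT-2 token ids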