Spaces:

Bordoglor
/

Ramzes_diffusion_nn

Sleeping

App Files Files Community

Ramzes_diffusion_nn / peft /method_comparison /MetaMathQA /data.py

Bordoglor

Upload folder using huggingface_hub

92c1c00 verified about 2 months ago

raw

history blame contribute delete

4.7 kB

	# Copyright 2025-present the HuggingFace Inc. team.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""
	All utilities related to data handling.
	"""

	from functools import partial
	from typing import Callable

	import datasets
	import numpy as np
	from datasets import Dataset, load_dataset


	# with a token limit of 768 for query + response, we have to exclude all texts with length > 1304; this leaves 93.8% of
	# the dataset
	# NOTE(review): the constant below is 1300, not the 1304 quoted above -- presumably rounded down; confirm which
	# value the 93.8% figure refers to.
	# Maximum character length of "{query} {response}" for a sample to be kept by get_filtered_dataset.
	CHAR_LIMIT = 1300
	# train/valid/test split -- note that evaluation takes quite long, so don't choose too large sizes for the valid set,
	# since it's run multiple times during training; test is only run once at the end and thus can be larger
	# Number of shuffled GSM8K train samples that get_train_valid_test_datasets selects as the validation set.
	VALID_SIZE = 50


	def get_filtered_dataset(*, ds: datasets.Dataset, print_fn: Callable[..., None]) -> Dataset:
	    """Return the dataset with overly long samples removed.

	    A sample is kept when the character length of ``f"{query} {response}"`` is at most ``CHAR_LIMIT``. Character
	    counts serve as a proxy for token counts: we cannot use tokens directly, as that depends on the tokenizer, which
	    can be different for each model, but we want the same filter for each model.

	    (An earlier note in this docstring stated that 99% of queries have 529 or fewer characters; the limit applied
	    here covers query and response combined.)

	    Args:
	        ds: Dataset providing ``"query"`` and ``"response"`` columns.
	        print_fn: Callable that receives a one-line report of the percentage of samples kept.

	    Returns:
	        The samples of ``ds`` that satisfy the length limit, selected in their original order.
	    """
	    idx_filtered = [
	        i for i, (q, r) in enumerate(zip(ds["query"], ds["response"])) if len(f"{q} {r}") <= CHAR_LIMIT
	    ]
	    num_total = len(ds)
	    # An empty dataset would make the division below raise ZeroDivisionError; report 0.0% in that case.
	    pct_kept = 100 * len(idx_filtered) / num_total if num_total else 0.0
	    print_fn(f"Filtered dataset: {pct_kept:.1f}% of the original dataset")
	    return ds.select(idx_filtered)


	def get_train_valid_test_datasets(
	    *, tokenizer, query_template: str, print_fn: Callable[..., None]
	) -> tuple[Dataset, Dataset, Dataset]:
	    """
	    Return the tokenized train, valid, and test datasets.

	    - train: the "train" split of "meta-math/MetaMathQA" after length filtering with ``get_filtered_dataset``,
	      tokenized as query + response.
	    - valid: ``VALID_SIZE`` samples picked from the shuffled "train" split of "openai/gsm8k" ("main" config),
	      tokenized as query only.
	    - test: the full "test" split of "openai/gsm8k", tokenized as query only.

	    The validation samples are chosen with a fixed seed (0), so the split is the same on every call. The
	    ``response`` column is not removed from any of the three datasets.

	    Args:
	        tokenizer: Tokenizer forwarded to ``tokenize_with_answer`` / ``tokenize_wo_answer``.
	        query_template: Format string forwarded to the tokenize functions as ``template``; they fill it via
	            ``template.format(query=...)``.
	        print_fn: Callable that receives progress messages (filter ratio and split sizes).

	    Returns:
	        A ``(ds_train, ds_valid, ds_test)`` tuple.
	    """
	    metamath = load_dataset("meta-math/MetaMathQA")["train"]
	    metamath = get_filtered_dataset(ds=metamath, print_fn=print_fn)

	    # gsm8k does not need to be filtered as query and response are short enough
	    gsm8k = load_dataset("openai/gsm8k", "main")
	    gsm8k = gsm8k.rename_columns({"question": "query", "answer": "response"})
	    gsm8k_train = gsm8k["train"]
	    gsm8k_test = gsm8k["test"]

	    # Use a private generator rather than np.random.seed(0) so that the caller's global NumPy RNG state is left
	    # untouched. RandomState(0) produces the same permutation as seeding the global generator with 0, so the
	    # selected validation samples do not change.
	    rng = np.random.RandomState(0)
	    indices = np.arange(len(gsm8k_train))
	    rng.shuffle(indices)
	    idx_valid = indices[:VALID_SIZE]

	    ds_train = metamath
	    ds_valid = gsm8k_train.select(idx_valid)
	    ds_test = gsm8k_test

	    print_fn(f"Train size: {len(ds_train)}")
	    print_fn(f"Valid size: {len(ds_valid)}")
	    print_fn(f"Test size: {len(ds_test)}")

	    tokenize_with_answer_ = partial(tokenize_with_answer, tokenizer=tokenizer, template=query_template)
	    tokenize_wo_answer_ = partial(tokenize_wo_answer, tokenizer=tokenizer, template=query_template)
	    # Training samples include the response in the tokenized text; valid/test samples contain the query only.
	    ds_train = ds_train.map(tokenize_with_answer_, batched=True).remove_columns(["type", "query", "original_question"])
	    ds_valid = ds_valid.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])
	    ds_test = ds_test.map(tokenize_wo_answer_, batched=True).remove_columns(["query"])

	    return ds_train, ds_valid, ds_test


	def tokenize_with_answer(samples, tokenizer, template):
	    """Tokenize each formatted query followed by its response, truncated to the tokenizer's maximum length.

	    Args:
	        samples: Batch mapping whose ``"query"`` and ``"response"`` entries are parallel sequences of strings.
	        tokenizer: Callable that tokenizes a list of strings and exposes a ``model_max_length`` attribute.
	        template: Format string filled via ``template.format(query=...)``; the response is appended to the result.

	    Returns:
	        The tokenizer output, with every field cut to at most ``tokenizer.model_max_length`` entries per sample.
	    """
	    texts = [
	        template.format(query=query) + response for query, response in zip(samples["query"], samples["response"])
	    ]
	    tokenized = tokenizer(texts)
	    max_length = tokenizer.model_max_length
	    # Truncate every returned field, not only input_ids and attention_mask: a tokenizer that also returns e.g.
	    # token_type_ids would otherwise yield fields whose lengths no longer match the truncated input_ids.
	    # NOTE(review): assumes each field is a list with one token-level sequence per sample -- confirm for custom
	    # tokenizers.
	    for key in list(tokenized.keys()):
	        tokenized[key] = [sequence[:max_length] for sequence in tokenized[key]]
	    return tokenized


	def tokenize_wo_answer(samples, tokenizer, template):
	    """Tokenize each formatted query without its response, truncated to the tokenizer's maximum length.

	    Args:
	        samples: Batch mapping whose ``"query"`` entry is a sequence of strings.
	        tokenizer: Callable that tokenizes a list of strings and exposes a ``model_max_length`` attribute.
	        template: Format string filled via ``template.format(query=...)``.

	    Returns:
	        The tokenizer output, with every field cut to at most ``tokenizer.model_max_length`` entries per sample.
	    """
	    texts = [template.format(query=query) for query in samples["query"]]
	    tokenized = tokenizer(texts)
	    max_length = tokenizer.model_max_length
	    # Truncate every returned field, not only input_ids and attention_mask: a tokenizer that also returns e.g.
	    # token_type_ids would otherwise yield fields whose lengths no longer match the truncated input_ids.
	    # NOTE(review): assumes each field is a list with one token-level sequence per sample -- confirm for custom
	    # tokenizers.
	    for key in list(tokenized.keys()):
	        tokenized[key] = [sequence[:max_length] for sequence in tokenized[key]]
	    return tokenized