Spaces:

onewayto
/

water

Sleeping

App Files Files Community

water / eval /create_eval_dataset.py

onewayto

Upload 102 files

de93e67 verified about 1 month ago

raw

history blame contribute delete

5.27 kB

	from itertools import product

	from datasets import Dataset

	# Prompt templates for the eval set ("Very hard" difficulty is excluded).
	# Each row is (template, difficulty, category, placeholder names); the
	# placeholder names are the keys looked up in `values` when a template is
	# expanded. main() pairs templates with `task_limits` by position, so a row
	# disabled here needs its limit entry disabled as well.
	tasks = [
	    {
	        "task": template,
	        "difficulty": difficulty,
	        "category": category,
	        "params": list(placeholders),
	    }
	    for template, difficulty, category, placeholders in (
	        (
	            "Evaluate models {M} on benchmarks {B}",
	            "Easy",
	            "Evaluation",
	            ("M", "B"),
	        ),
	        (
	            "Train models {M} on datasets {D} evaluating them on benchmarks {B}",
	            "Medium",
	            "Training",
	            ("M", "D", "B"),
	        ),
	        (
	            "Run an ablation for hyperparameter {P} for model {M} on dataset {D}",
	            "Hard",
	            "Ablation",
	            ("P", "M", "D"),
	        ),
	        (
	            "Generate completions with model {M} on benchmarks {B} using engine {E}",
	            "Medium",
	            "Generation",
	            ("M", "B", "E"),
	        ),
	        # Disabled template (kept for reference):
	        # (
	        #     "Merge models {M} using linear averaging to find the best result on benchmarks {B}",
	        #     "Hard",
	        #     "Model Merging",
	        #     ("M", "B"),
	        # ),
	        (
	            "Decontaminate dataset {D} against benchmarks {B}",
	            "Hard",
	            "Data Processing",
	            ("D", "B"),
	        ),
	        (
	            "Format dataset {D} for compatibility with framework {F} on task {T}",
	            "Easy",
	            "Data Formatting",
	            ("D", "F", "T"),
	        ),
	    )
	]

	# Candidate values for every template placeholder. Each string is inserted
	# verbatim into the prompt text, so entries may be free-form descriptions
	# rather than plain repository ids.
	values = {
	    # {M}: models
	    "M": [
	        "Qwen/Qwen3-4B-Instruct-2507",
	        "openai/gpt-oss-20b",
	        "gpt-4o-mini",
	        "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
	        "anthropic's latest model",
	    ],
	    # {B}: benchmarks
	    "B": [
	        "Idavidrein/gpqa",
	        "HuggingFaceH4/MATH-500",
	        "lighteval/SimpleQA",
	        "TIGER-Lab/MMLU-Pro",
	    ],
	    # {D}: datasets
	    "D": [
	        "HuggingFaceH4/multi_turn_if",
	        "HuggingFaceH4/ultrachat_200k",
	        "HuggingFaceH4/AceReason-1.1-SFT config: math_no_think",
	    ],
	    # {E}: generation engines
	    "E": ["vllm", "sglang"],
	    # {F}: frameworks
	    "F": ["trl", "axolotl", "verl"],
	    # {P}: hyperparameters to ablate
	    "P": ["learning_rate", "batch_size", "num_epochs"],
	    # {T}: task types a dataset is formatted for
	    "T": ["SFT", "GRPO"],
	}

	# Per-template instance limits, matched to `tasks` by position.
	# "pivot" names the placeholder(s) whose values are fully enumerated; it may
	# be a single placeholder string or a list of placeholders. For every
	# combination of pivot values, at most "instances_per_pivot" combinations of
	# the template's remaining placeholders are used (the first ones in
	# itertools.product order).
	task_limits = [
	    {"pivot": pivot, "instances_per_pivot": count}
	    for pivot, count in (
	        ("B", 1),  # Evaluation: 1 instance per benchmark
	        (["M", "B"], 3),  # Training: up to 3 per (model, benchmark) pair
	        (["P", "D"], 3),  # Ablation: up to 3 per (hyperparameter, dataset) pair
	        ("E", 2),  # Generation: up to 2 per engine
	        # ("M", 2),  # Model Merging (template disabled)
	        ("D", 2),  # Data Processing: up to 2 per dataset
	        # Data Formatting: the pivot covers every placeholder, so each
	        # (dataset, framework, task) triple yields exactly one instance.
	        (["D", "F", "T"], 2),
	    )
	]


	def _expand_template(task_dict, limit_config, param_values):
	    """Expand one task template into concrete dataset rows.

	    Args:
	        task_dict: Template entry with "task" (a str.format template),
	            "difficulty", "category" and "params" (placeholder names).
	        limit_config: Entry with "pivot" (one placeholder name or a list of
	            names) and "instances_per_pivot" (max rows per pivot combination).
	        param_values: Mapping from placeholder name to its candidate values.

	    Returns:
	        A list of {"task", "difficulty", "category"} dicts, ordered by pivot
	        combination first and by non-pivot combination second.

	    Raises:
	        ValueError: If a pivot placeholder is not one of the template's params.
	    """
	    template = task_dict["task"]
	    params = task_dict["params"]

	    # Normalize pivot to a list so single and multiple pivots share one path.
	    pivot_params = limit_config["pivot"]
	    if isinstance(pivot_params, str):
	        pivot_params = [pivot_params]

	    # str.format silently ignores unused keyword arguments, so a pivot that
	    # the template does not use would only produce duplicate prompts.
	    unknown = [p for p in pivot_params if p not in params]
	    if unknown:
	        raise ValueError(
	            f"Pivot parameter(s) {unknown} are not used by template {template!r}"
	        )

	    # The non-pivot combinations do not depend on the pivot values, so they
	    # are built and truncated once instead of once per pivot combination.
	    other_params = [p for p in params if p not in pivot_params]
	    other_combinations = list(product(*(param_values[p] for p in other_params)))
	    limited_combinations = other_combinations[: limit_config["instances_per_pivot"]]

	    rows = []
	    for pivot_combo in product(*(param_values[p] for p in pivot_params)):
	        pivot_kwargs = dict(zip(pivot_params, pivot_combo))
	        for combo in limited_combinations:
	            kwargs = {**pivot_kwargs, **dict(zip(other_params, combo))}
	            rows.append(
	                {
	                    "task": template.format(**kwargs),
	                    "difficulty": task_dict["difficulty"],
	                    "category": task_dict["category"],
	                }
	            )
	    return rows


	def main():
	    """Build the evaluation dataset from the templates and push it to the Hub.

	    Every entry of `tasks` is expanded with the limit at the same position in
	    `task_limits`; the resulting rows are uploaded as a public dataset.

	    Raises:
	        ValueError: If `tasks` and `task_limits` differ in length, a limit
	            pivots on a placeholder its template does not use, or no
	            instance is generated at all.
	    """
	    # Templates and limits are paired by position; a length mismatch would
	    # otherwise surface as an IndexError or as silently ignored limits.
	    if len(tasks) != len(task_limits):
	        raise ValueError(
	            f"tasks has {len(tasks)} entries but task_limits has {len(task_limits)}"
	        )

	    eval_data = []
	    for task_dict, limit_config in zip(tasks, task_limits):
	        eval_data.extend(_expand_template(task_dict, limit_config, values))

	    print(f"Generated {len(eval_data)} instances from {len(tasks)} templates")

	    # Fail clearly instead of hitting an IndexError on dataset[0] below.
	    if not eval_data:
	        raise ValueError("No instances generated; nothing to push")

	    dataset = Dataset.from_list(eval_data)
	    print(f"\nDataset: {len(dataset)} rows")
	    print(f"Sample: {dataset[0]['task']}")

	    # NOTE(review): the repo id is spelled "qyestions" -- confirm it is intentional.
	    # private=False publishes the dataset publicly.
	    dataset.push_to_hub("akseljoonas/qyestions", private=False)
	    print("\n✓ Pushed to akseljoonas/qyestions")


	# Run the generation and upload only when executed as a script.
	if __name__ == "__main__":
	    main()