| """Run MLEvolve on a GraphTestbed task, routed through CLIProxyAPI. | |
| Usage: | |
| python -m agents.mlevolve.runner --task figraph | |
| python -m agents.mlevolve.runner --task figraph \\ | |
| --model gpt-5.3-codex-spark --steps 100 | |
| python -m agents.mlevolve.runner --task figraph \\ | |
| --submit mlevolve-codex-spark | |
| What this does: | |
| 1. Build an mle-bench-shaped tree from the GraphTestbed task data | |
| (val-as-test for v1 β see adapter.py for why). | |
| 2. Render config.yaml into _vendor/MLEvolve/config/, with the proxy | |
| endpoint + model wired into agent.code and agent.feedback. | |
| 3. Invoke `python run.py β¦` from inside _vendor/MLEvolve/ with Hydra | |
| overrides for paths and run-budget. | |
| 4. Harvest the latest submission.csv from runs/, normalize its column | |
| names, validate against the testbed schema, and (optionally) submit. | |
| Known v1 limitation: the produced submission scores VAL-set predictions, | |
| not TEST-set. To score on test, rerun the best runfile.py against | |
| <workspace>/mlebench-tree/<task>/REAL_TEST_FEATURES.csv before submitting. | |
| """ | |
from __future__ import annotations

import argparse
import os
import subprocess
import sys
from pathlib import Path

import pandas as pd

from agents.cliproxyapi import (
    ProxyEndpoint,
    openai_yaml_block,
    wait_until_ready,
)
from agents.common.submit import finalize
from agents.common.workspace import make_workspace
from agents.mlevolve.adapter import stage as stage_mlebench
from graphtestbed._manifest import task_config
DEFAULT_MODEL = "gpt-5.3-codex-spark"


def _resolve_mlevolve_dir() -> Path:
    explicit = os.environ.get("MLEVOLVE_DIR")
    if explicit:
        p = Path(explicit)
        if not (p / "run.py").exists():
            raise SystemExit(f"MLEVOLVE_DIR={p} does not contain run.py")
        return p
    vendored = Path(__file__).parent / "_vendor" / "MLEvolve"
    if (vendored / "run.py").exists():
        return vendored
    raise SystemExit(
        "Cannot locate MLEvolve.\n"
        "  Install: bash agents/mlevolve/install.sh\n"
        "  Or set MLEVOLVE_DIR to your existing clone."
    )
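# For reference, an explicit MLEVOLVE_DIR always wins over the vendored copy:
#   MLEVOLVE_DIR=~/src/MLEvolve python -m agents.mlevolve.runner --task figraph
# (path illustrative; install.sh populates the _vendor/ fallback).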
def _hydra_overrides(
    task: str, mlebench_root: Path, prepared: Path, ep: ProxyEndpoint,
    model: str, steps: int, time_limit_s: int, num_gpus: int,
) -> list[str]:
    """Build Hydra-style key=value overrides for run.py."""
    public = prepared / "public"
    block = openai_yaml_block(ep, model)
    cfg_metric = task_config(task)["metric"]["primary"]
    overrides = [
        f"exp_id={task}",
        f"exp_name={task}",
        f"dataset_dir={mlebench_root}",
        f"data_dir={public}",
        f"desc_file={public / 'description.md'}",
        "start_cpu_id=0",
        "cpu_number=4",
        # LLM routing → proxy
        f"agent.code.model={block['model']}",
        f"agent.code.base_url={block['base_url']}",
        f"agent.code.api_key={block['api_key']}",
        f"agent.feedback.model={block['model']}",
        f"agent.feedback.base_url={block['base_url']}",
        f"agent.feedback.api_key={block['api_key']}",
        # Run-budget overrides
        f"agent.steps={steps}",
        f"agent.time_limit={time_limit_s}",
        f"agent.memory_embedding_device={'cuda' if num_gpus > 0 else 'cpu'}",
        f"agent.search.num_gpus={num_gpus}",
        "use_grading_server=false",
        # Goal hint
        f"goal=Maximize {cfg_metric} on the test set",
        f"eval={cfg_metric}",
    ]
    return overrides
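# Illustrative command line these overrides flatten into (example values only;
# the actual base_url comes from ProxyEndpoint.from_env()):
#
#   python run.py exp_id=figraph exp_name=figraph \
#       dataset_dir=<ws>/mlebench-tree data_dir=<ws>/mlebench-tree/figraph/public \
#       agent.code.model=gpt-5.3-codex-spark agent.code.base_url=<proxy>/v1 \
#       agent.steps=100 agent.time_limit=7200 use_grading_server=false ...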
def _harvest_submission(
    task: str, mlevolve_dir: Path, dst: Path,
) -> Path:
    schema = task_config(task)["submission_schema"]
    runs = mlevolve_dir / "runs"
    if not runs.exists():
        raise SystemExit(f"No runs/ dir under {mlevolve_dir}")
    candidates = sorted(runs.rglob("submission.csv"),
                        key=lambda p: p.stat().st_mtime)
    if not candidates:
        raise SystemExit(
            f"No submission.csv produced under {runs}. "
            f"Inspect {dst / 'agent.log'} for the failure mode."
        )
    chosen = candidates[-1]
    df = pd.read_csv(chosen)
    expected = [schema["id_col"], schema["pred_col"]]
    if list(df.columns) != expected:
        if len(df.columns) == 2:
            print(f"  (renaming columns {list(df.columns)} → {expected})")
            df.columns = expected
        else:
            raise SystemExit(
                f"Cannot normalize {chosen}: got {list(df.columns)}, "
                f"expected {expected}"
            )
    out = dst / "val_submission.csv"
    df.to_csv(out, index=False)
    print(f"✓ Picked {chosen.relative_to(mlevolve_dir)}")
    return out
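# For orientation: submission_schema is assumed to carry two column names, e.g.
#   {"id_col": "graph_id", "pred_col": "prediction"}   # names illustrative
# so a well-formed val_submission.csv is exactly those two columns, in order.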
def _print_followup(task: str, ws: Path, mlevolve_dir: Path, val_sub: Path) -> None:
    real_test = ws / "mlebench-tree" / task / "REAL_TEST_FEATURES.csv"
    print()
    print(f"⚠ v1 limitation: {val_sub} scores VAL predictions.")
    print("  To score on the actual test set:")
    print(f"  1. Find the best runfile.py under {mlevolve_dir / 'runs'}/<latest>/")
    print("  2. Re-run it with test.csv replaced by:")
    print(f"       {real_test}")
    print("  3. Submit the resulting CSV via:")
    print(f"       gtb submit {task} --file <path> --agent <name>")
def main() -> None:
    ap = argparse.ArgumentParser(prog="agents.mlevolve.runner")
    ap.add_argument("--task", required=True)
    ap.add_argument("--model", default=DEFAULT_MODEL,
                    help=f"default: {DEFAULT_MODEL}")
    ap.add_argument("--steps", type=int, default=100,
                    help="agent.steps, the MCGS exploration count "
                         "(default: 100; upstream default is 500)")
    ap.add_argument("--time-limit-min", type=int, default=120,
                    help="agent.time_limit in minutes (default: 120)")
    ap.add_argument("--gpus", type=int, default=0,
                    help="search.num_gpus (default: 0, i.e. CPU only)")
    ap.add_argument("--submit", default=None, metavar="AGENT_ID",
                    help="POST the val-set submission to the scoring API "
                         "under this name. Note: scores VAL, not test (see "
                         "the runner docstring).")
    ap.add_argument("--workspace-root", type=Path, default=None)
    args = ap.parse_args()
    mlevolve_dir = _resolve_mlevolve_dir()
    ep = ProxyEndpoint.from_env()
    wait_until_ready(ep)
    print(f"✓ Proxy ready at {ep.base_url()}")
    print(f"✓ MLEvolve at {mlevolve_dir}")

    ws = make_workspace("mlevolve", args.task, args.workspace_root)
    mlebench_root = ws / "mlebench-tree"
    prepared = stage_mlebench(args.task, mlebench_root)
    print(f"✓ mle-bench tree staged at {mlebench_root}")

    overrides = _hydra_overrides(
        task=args.task,
        mlebench_root=mlebench_root,
        prepared=prepared,
        ep=ep,
        model=args.model,
        steps=args.steps,
        time_limit_s=args.time_limit_min * 60,
        num_gpus=args.gpus,
    )
    cmd = [sys.executable, "run.py", *overrides]
    print(f"✓ Launching MLEvolve task={args.task} model={args.model}")
    print(f"  workspace: {ws}")
    log = ws / "agent.log"
    with log.open("wb") as lf:
        rc = subprocess.call(cmd, cwd=mlevolve_dir,
                             stdout=lf, stderr=subprocess.STDOUT)
    print(f"  exit={rc} log={log}")
    if rc != 0:
        raise SystemExit(rc)
    val_sub = _harvest_submission(args.task, mlevolve_dir, ws)
    _print_followup(args.task, ws, mlevolve_dir, val_sub)

    # Note: don't auto-finalize against the `test_features.csv` schema, since
    # this is a val-set submission. Just print & stop (unless --submit is set).
    print()
    print(f"  val_submission: {val_sub}")
    if args.submit:
        print(f"  --submit was set; posting val-set predictions as "
              f"`{args.submit}` (these will score 0 against the test GT).")
        finalize(args.task, val_sub, args.submit)


if __name__ == "__main__":
    main()