| """Run MLEvolve on a GraphTestbed task, routed through CLIProxyAPI. | |
| Usage: | |
| python -m agents.mlevolve.runner --task figraph | |
| python -m agents.mlevolve.runner --task figraph \\ | |
| --model gpt-5.3-codex-spark --steps 100 | |
| python -m agents.mlevolve.runner --task figraph \\ | |
| --submit mlevolve-codex-spark | |
| What this does: | |
| 1. Build an mle-bench-shaped tree from the GraphTestbed task data | |
| (val-as-test for v1 β see adapter.py for why). | |
| 2. Render config.yaml into _vendor/MLEvolve/config/, with the proxy | |
| endpoint + model wired into agent.code and agent.feedback. | |
| 3. Invoke `python run.py β¦` from inside _vendor/MLEvolve/ with Hydra | |
| overrides for paths and run-budget. | |
| 4. Harvest the latest submission.csv from runs/, normalize its column | |
| names, validate against the testbed schema, and (optionally) submit. | |
| Known v1 limitation: the produced submission scores VAL-set predictions, | |
| not TEST-set. To score on test, rerun the best runfile.py against | |
| <workspace>/mlebench-tree/<task>/REAL_TEST_FEATURES.csv before submitting. | |
| """ | |
from __future__ import annotations

import argparse
import os
import subprocess
import sys
from pathlib import Path

import pandas as pd

from agents.cliproxyapi import (
    ProxyEndpoint,
    openai_yaml_block,
    wait_until_ready,
)
from agents.common.submit import finalize
from agents.common.workspace import make_workspace
from agents.mlevolve.adapter import stage as stage_mlebench
from graphtestbed._manifest import task_config
DEFAULT_MODEL = "gpt-5.3-codex-spark"


def _resolve_mlevolve_dir() -> Path:
    explicit = os.environ.get("MLEVOLVE_DIR")
    if explicit:
        p = Path(explicit)
        if not (p / "run.py").exists():
            raise SystemExit(f"MLEVOLVE_DIR={p} does not contain run.py")
        return p
    vendored = Path(__file__).parent / "_vendor" / "MLEvolve"
    if (vendored / "run.py").exists():
        return vendored
    raise SystemExit(
        "Cannot locate MLEvolve.\n"
        "  Install: bash agents/mlevolve/install.sh\n"
        "  Or set MLEVOLVE_DIR to your existing clone."
    )
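# For reference, an explicit MLEVOLVE_DIR always wins over the vendored copy:
#   MLEVOLVE_DIR=~/src/MLEvolve python -m agents.mlevolve.runner --task figraph
# (path illustrative; install.sh populates the _vendor/ fallback).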
def _hydra_overrides(
    task: str, mlebench_root: Path, prepared: Path, ep: ProxyEndpoint,
    model: str, steps: int, time_limit_s: int, num_gpus: int,
) -> list[str]:
    """Build Hydra-style key=value overrides for run.py."""
    public = prepared / "public"
    block = openai_yaml_block(ep, model)
    cfg_metric = task_config(task)["metric"]["primary"]
    overrides = [
        f"exp_id={task}",
        f"exp_name={task}",
        f"dataset_dir={mlebench_root}",
        f"data_dir={public}",
        f"desc_file={public / 'description.md'}",
        "start_cpu_id=0",
        "cpu_number=4",
        # LLM routing → proxy
        f"agent.code.model={block['model']}",
        f"agent.code.base_url={block['base_url']}",
        f"agent.code.api_key={block['api_key']}",
        f"agent.feedback.model={block['model']}",
        f"agent.feedback.base_url={block['base_url']}",
        f"agent.feedback.api_key={block['api_key']}",
        # Run-budget overrides
        f"agent.steps={steps}",
        f"agent.time_limit={time_limit_s}",
        f"agent.memory_embedding_device={'cuda' if num_gpus > 0 else 'cpu'}",
        f"agent.search.num_gpus={num_gpus}",
        "use_grading_server=false",
        # Goal hint
        f"goal=Maximize {cfg_metric} on the test set",
        f"eval={cfg_metric}",
    ]
    return overrides
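# Illustrative command line these overrides flatten into (example values only;
# the actual base_url comes from ProxyEndpoint.from_env()):
#
#   python run.py exp_id=figraph exp_name=figraph \
#       dataset_dir=<ws>/mlebench-tree data_dir=<ws>/mlebench-tree/figraph/public \
#       agent.code.model=gpt-5.3-codex-spark agent.code.base_url=<proxy>/v1 \
#       agent.steps=100 agent.time_limit=7200 use_grading_server=false ...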
def _harvest_submission(
    task: str, mlevolve_dir: Path, dst: Path,
) -> Path:
    schema = task_config(task)["submission_schema"]
    runs = mlevolve_dir / "runs"
    if not runs.exists():
        raise SystemExit(f"No runs/ dir under {mlevolve_dir}")
    candidates = sorted(runs.rglob("submission.csv"),
                        key=lambda p: p.stat().st_mtime)
    if not candidates:
        raise SystemExit(
            f"No submission.csv produced under {runs}. "
            f"Inspect {dst / 'agent.log'} for the failure mode."
        )
    chosen = candidates[-1]
    df = pd.read_csv(chosen)
    expected = [schema["id_col"], schema["pred_col"]]
    if list(df.columns) != expected:
        if len(df.columns) == 2:
            print(f"  (renaming columns {list(df.columns)} → {expected})")
            df.columns = expected
        else:
            raise SystemExit(
                f"Cannot normalize {chosen}: got {list(df.columns)}, "
                f"expected {expected}"
            )
    out = dst / "val_submission.csv"
    df.to_csv(out, index=False)
    print(f"✓ Picked {chosen.relative_to(mlevolve_dir)}")
    return out
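# For orientation: submission_schema is assumed to carry two column names, e.g.
#   {"id_col": "graph_id", "pred_col": "prediction"}   # names illustrative
# so a well-formed val_submission.csv is exactly those two columns, in order.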
def _print_followup(task: str, ws: Path, mlevolve_dir: Path, val_sub: Path) -> None:
    real_test = ws / "mlebench-tree" / task / "REAL_TEST_FEATURES.csv"
    print()
    print(f"⚠ v1 limitation: {val_sub} scores VAL predictions.")
    print("  To score on the actual test set:")
    print(f"  1. Find the best runfile.py under {mlevolve_dir / 'runs'}/<latest>/")
    print("  2. Re-run it with test.csv replaced by:")
    print(f"       {real_test}")
    print("  3. Submit the resulting CSV via:")
    print(f"       gtb submit {task} --file <path> --agent <name>")
def main() -> None:
    ap = argparse.ArgumentParser(prog="agents.mlevolve.runner")
    ap.add_argument("--task", required=True)
    ap.add_argument("--model", default=DEFAULT_MODEL,
                    help=f"default: {DEFAULT_MODEL}")
    ap.add_argument("--steps", type=int, default=100,
                    help="agent.steps, the MCGS exploration count "
                         "(default: 100; upstream default is 500)")
    ap.add_argument("--time-limit-min", type=int, default=120,
                    help="agent.time_limit in minutes (default: 120)")
    ap.add_argument("--gpus", type=int, default=0,
                    help="search.num_gpus (default: 0, i.e. CPU only)")
    ap.add_argument("--submit", default=None, metavar="AGENT_ID",
                    help="POST the val-set submission to the scoring API "
                         "under this name. Note: scores VAL, not test (see "
                         "the runner docstring).")
    ap.add_argument("--workspace-root", type=Path, default=None)
    args = ap.parse_args()
    mlevolve_dir = _resolve_mlevolve_dir()
    ep = ProxyEndpoint.from_env()
    wait_until_ready(ep)
    print(f"✓ Proxy ready at {ep.base_url()}")
    print(f"✓ MLEvolve at {mlevolve_dir}")

    ws = make_workspace("mlevolve", args.task, args.workspace_root)
    mlebench_root = ws / "mlebench-tree"
    prepared = stage_mlebench(args.task, mlebench_root)
    print(f"✓ mle-bench tree staged at {mlebench_root}")

    overrides = _hydra_overrides(
        task=args.task,
        mlebench_root=mlebench_root,
        prepared=prepared,
        ep=ep,
        model=args.model,
        steps=args.steps,
        time_limit_s=args.time_limit_min * 60,
        num_gpus=args.gpus,
    )
    cmd = [sys.executable, "run.py", *overrides]
    print(f"✓ Launching MLEvolve task={args.task} model={args.model}")
    print(f"  workspace: {ws}")
    log = ws / "agent.log"
    with log.open("wb") as lf:
        rc = subprocess.call(cmd, cwd=mlevolve_dir,
                             stdout=lf, stderr=subprocess.STDOUT)
    print(f"  exit={rc} log={log}")
    if rc != 0:
        raise SystemExit(rc)
    val_sub = _harvest_submission(args.task, mlevolve_dir, ws)
    _print_followup(args.task, ws, mlevolve_dir, val_sub)

    # Note: don't auto-finalize against the `test_features.csv` schema, since
    # this is a val-set submission. Just print & stop (unless --submit is set).
    print()
    print(f"  val_submission: {val_sub}")
    if args.submit:
        print(f"  --submit was set; posting val-set predictions as "
              f"`{args.submit}` (these will score 0 against the test GT).")
        finalize(args.task, val_sub, args.submit)


if __name__ == "__main__":
    main()