JustinTX committed
Commit 730e01e · verified · 1 parent: bc9b4d5

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. benchmarks/ADRS/eplb/README.md +63 -0
  2. benchmarks/ADRS/eplb/initial_program.py +238 -0
  3. benchmarks/ADRS/llm_sql/evaluator/utils.py +81 -0
  4. benchmarks/ADRS/prism/config.yaml +24 -0
  5. benchmarks/ADRS/prism/evaluator/Dockerfile +13 -0
  6. benchmarks/ADRS/prism/evaluator/evaluator.py +259 -0
  7. benchmarks/ADRS/prism/evaluator/requirements.txt +1 -0
  8. benchmarks/ADRS/prism/evaluator/wrapper.py +98 -0
  9. benchmarks/arc_benchmark/config.yaml +51 -0
  10. benchmarks/arc_benchmark/convert_arc_agi2_data.py +63 -0
  11. benchmarks/arc_benchmark/evaluator/Dockerfile +13 -0
  12. benchmarks/arc_benchmark/evaluator/evaluate.sh +7 -0
  13. benchmarks/arc_benchmark/evaluator/evaluator.py +407 -0
  14. benchmarks/arc_benchmark/evaluator/requirements.txt +1 -0
  15. benchmarks/arc_benchmark/evaluator/wrapper.py +98 -0
  16. benchmarks/arc_benchmark/generate_config.py +101 -0
  17. benchmarks/arc_benchmark/initial_program.py +42 -0
  18. benchmarks/arc_benchmark/post_discovery_eval.py +157 -0
  19. benchmarks/frontier-cs-eval/README.md +72 -0
  20. benchmarks/frontier-cs-eval/analyze_results.py +105 -0
  21. benchmarks/frontier-cs-eval/combine_results.py +66 -0
  22. benchmarks/frontier-cs-eval/config.yaml +57 -0
  23. benchmarks/frontier-cs-eval/evaluator.py +174 -0
  24. benchmarks/frontier-cs-eval/initial_program.cpp +6 -0
  25. benchmarks/frontier-cs-eval/run_all_frontiercs.py +70 -0
  26. benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py +404 -0
  27. benchmarks/image_gen/README.md +40 -0
  28. benchmarks/image_gen/sky_festival/config.yaml +103 -0
  29. benchmarks/image_gen/sky_festival/evaluator.py +220 -0
  30. benchmarks/math/circle_packing_rect/evaluator/evaluator.py +119 -0
  31. benchmarks/math/erdos_min_overlap/config.yaml +41 -0
  32. benchmarks/math/erdos_min_overlap/evaluator/Dockerfile +13 -0
  33. benchmarks/math/erdos_min_overlap/evaluator/requirements.txt +3 -0
  34. benchmarks/math/erdos_min_overlap/initial_program.py +96 -0
  35. benchmarks/math/heilbronn_convex/13/evaluator/evaluate.sh +7 -0
  36. benchmarks/math/heilbronn_convex/13/evaluator/wrapper.py +98 -0
  37. benchmarks/math/hexagon_packing/11/evaluator/evaluate.sh +7 -0
  38. benchmarks/math/matmul/evaluator/Dockerfile +13 -0
  39. benchmarks/math/matmul/evaluator/evaluate.sh +7 -0
  40. benchmarks/math/matmul/evaluator/evaluator.py +115 -0
  41. benchmarks/math/matmul/evaluator/requirements.txt +3 -0
  42. benchmarks/math/matmul/evaluator/wrapper.py +98 -0
  43. benchmarks/math/matmul/initial_program.py +199 -0
  44. benchmarks/math/minimizing_max_min_dist/2/config.yaml +29 -0
  45. benchmarks/math/minimizing_max_min_dist/2/evaluator/Dockerfile +13 -0
  46. benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluate.sh +7 -0
  47. benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluator.py +78 -0
  48. benchmarks/math/minimizing_max_min_dist/2/evaluator/requirements.txt +2 -0
  49. benchmarks/math/minimizing_max_min_dist/2/evaluator/wrapper.py +98 -0
  50. benchmarks/math/minimizing_max_min_dist/2/initial_program.py +24 -0
benchmarks/ADRS/eplb/README.md ADDED
@@ -0,0 +1,63 @@
+ # Expert Parallelism Load Balancer (EPLB)
+
+ This benchmark uses SkyDiscover to optimize the Expert Parallelism Load Balancer (EPLB) algorithm for Mixture-of-Experts (MoE) models. The goal is to rearrange and replicate experts across GPUs to balance load, while keeping the rearrangement algorithm itself fast.
+
+ ## Setup
+
+ 1. **Install PyTorch** (required by the evaluator):
+
+ ```bash
+ uv pip install torch
+ ```
+
+ 2. **Download the workload file** from [Hugging Face](https://huggingface.co/datasets/abmfy/eplb-openevolve) into this directory:
+
+ ```bash
+ cd benchmarks/ADRS/eplb
+ wget https://huggingface.co/datasets/abmfy/eplb-openevolve/resolve/main/expert-load.json
+ ```
+
+ 3. **Set your API key:**
+
+ ```bash
+ export OPENAI_API_KEY=...
+ ```
+
+ ## Run
+
+ From the repo root:
+
+ ```bash
+ uv run skydiscover-run \
+   benchmarks/ADRS/eplb/initial_program.py \
+   benchmarks/ADRS/eplb/evaluator.py \
+   -c benchmarks/ADRS/eplb/config.yaml \
+   -s [your_algorithm] \
+   -i 100 \
+   -o eplb_output
+ ```
+
+ Or from this directory:
+
+ ```bash
+ uv run skydiscover-run initial_program.py evaluator.py \
+   -c config.yaml \
+   -s [your_algorithm] \
+   -i 100
+ ```
+
+ ## Evaluate a saved program
+
+ ```bash
+ python evaluate_best_program.py
+ ```
+
+ ## Files
+
+ | File | Description |
+ |------|-------------|
+ | `initial_program.py` | Baseline `rebalance_experts` function to evolve |
+ | `evaluator.py` | Scores programs on load-balance quality and execution speed |
+ | `config.yaml` | Task-specific config (LLM, evaluator timeout, system prompt) |
+ | `evaluate_best_program.py` | Standalone script to evaluate a saved best program |
+ | `expert-load.json` | Workload data (must be downloaded — see Setup) |
benchmarks/ADRS/eplb/initial_program.py ADDED
@@ -0,0 +1,238 @@
+ # SPDX-License-Identifier: Apache-2.0
+ """
+ Expert parallelism load balancer (EPLB) for vLLM.
+
+ This module implements the core rearrangement algorithm.
+
+ The rearrangement algorithm is adapted from
+ [DeepSeek EPLB](https://github.com/deepseek-ai/eplb).
+
+ Please find at [#12](https://github.com/deepseek-ai/EPLB/issues/12) an example
+ of how the EPLB algorithm works.
+ """
+
+ # EVOLVE-BLOCK-START
+
+ import torch
+
+
+ def balanced_packing(weight: torch.Tensor,
+                      num_packs: int) -> tuple[torch.Tensor, torch.Tensor]:
+     """
+     Pack n weighted objects to m packs, such that each pack contains exactly
+     n/m objects and the weights of all packs are as balanced as possible.
+
+     Parameters:
+         weight: [X, n], the weight of each item
+         num_packs: number of packs
+
+     Returns:
+         pack_index: [X, n], the pack index of each item
+         rank_in_pack: [X, n], the rank of the item in the pack
+     """
+     num_layers, num_groups = weight.shape
+     assert num_groups % num_packs == 0
+     groups_per_pack = num_groups // num_packs
+
+     if groups_per_pack == 1:
+         pack_index = torch.arange(weight.size(-1),
+                                   dtype=torch.int64,
+                                   device=weight.device).expand(weight.shape)
+         rank_in_pack = torch.zeros_like(weight, dtype=torch.int64)
+         return pack_index, rank_in_pack
+
+     indices = weight.float().sort(-1, descending=True).indices.cpu()
+     pack_index = torch.full_like(weight,
+                                  fill_value=-1,
+                                  dtype=torch.int64,
+                                  device="cpu")
+     rank_in_pack = torch.full_like(pack_index, fill_value=-1)
+     for i in range(num_layers):
+         pack_weights = [0] * num_packs
+         pack_items = [0] * num_packs
+         for group in indices[i]:
+             # Greedy: among packs that still have room, pick the lightest one.
+             pack = min(
+                 (i
+                  for i in range(num_packs) if pack_items[i] < groups_per_pack),
+                 key=pack_weights.__getitem__,
+             )
+             assert pack_items[pack] < groups_per_pack
+             pack_index[i, group] = pack
+             rank_in_pack[i, group] = pack_items[pack]
+             pack_weights[pack] += weight[i, group]
+             pack_items[pack] += 1
+     return pack_index, rank_in_pack
+
+
+ def replicate_experts(
+     weight: torch.Tensor,
+     num_phy: int) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Replicate `num_log` experts to `num_phy` replicas, such that the maximum
+     load of all replicas is minimized.
+
+     Parameters:
+         weight: [X, num_log]
+         num_phy: total number of experts after replication
+
+     Returns:
+         phy2log: [X, num_phy], logical expert id of each physical expert
+         rank: [X, num_phy], the replica rank
+         logcnt: [X, num_log], number of replicas for each logical expert
+     """
+     n, num_log = weight.shape
+     num_redundant = num_phy - num_log
+     assert num_redundant >= 0
+     device = weight.device
+     phy2log = torch.arange(num_phy, dtype=torch.int64,
+                            device=device).repeat(n, 1)
+     rank = torch.zeros(n, num_phy, dtype=torch.int64, device=device)
+     logcnt = torch.ones(n, num_log, dtype=torch.int64, device=device)
+     arangen = torch.arange(n, dtype=torch.int64, device=device)
+     for i in range(num_log, num_phy):
+         # Each extra slot goes to the expert with the highest per-replica load.
+         redundant_indices = (weight / logcnt).max(dim=-1).indices
+         phy2log[:, i] = redundant_indices
+         rank[:, i] = logcnt[arangen, redundant_indices]
+         logcnt[arangen, redundant_indices] += 1
+     return phy2log, rank, logcnt
+
+
+ def rebalance_experts_hierarchical(
+     weight: torch.Tensor,
+     num_physical_experts: int,
+     num_groups: int,
+     num_nodes: int,
+     num_gpus: int,
+ ):
+     """
+     Parameters:
+         weight: [num_moe_layers, num_logical_experts]
+         num_physical_experts: number of physical experts after replication
+         num_groups: number of expert groups
+         num_nodes: number of server nodes, where the intra-node network
+             (e.g., NVLink) is faster
+         num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+     Returns:
+         physical_to_logical_map: [num_moe_layers, num_physical_experts]
+         logical_to_physical_map: [num_moe_layers, num_logical_experts, X]
+         logical_count: [num_moe_layers, num_logical_experts]
+     """
+     num_layers, num_logical_experts = weight.shape
+     assert num_logical_experts % num_groups == 0
+     group_size = num_logical_experts // num_groups
+     assert num_groups % num_nodes == 0
+     groups_per_node = num_groups // num_nodes
+     assert num_gpus % num_nodes == 0
+     assert num_physical_experts % num_gpus == 0
+     phy_experts_per_gpu = num_physical_experts // num_gpus
+
+     def inverse(perm: torch.Tensor) -> torch.Tensor:
+         inv = torch.empty_like(perm)
+         inv.scatter_(
+             1,
+             perm,
+             torch.arange(perm.size(1), dtype=torch.int64,
+                          device=perm.device).expand(perm.shape),
+         )
+         return inv
+
+     # Step 1: pack groups to nodes
+     tokens_per_group = weight.unflatten(-1, (num_groups, group_size)).sum(-1)
+     group_pack_index, group_rank_in_pack = balanced_packing(
+         tokens_per_group, num_nodes)
+     log2mlog = (((group_pack_index * groups_per_node + group_rank_in_pack) *
+                  group_size).unsqueeze(-1) +
+                 torch.arange(group_size,
+                              dtype=torch.int64,
+                              device=group_pack_index.device)).flatten(-2)
+     mlog2log = inverse(log2mlog)
+
+     # Step 2: construct redundant experts within nodes
+     # [num_layers * num_nodes, num_logical_experts // num_nodes]
+     tokens_per_mlog = weight.gather(-1, mlog2log).view(
+         -1, num_logical_experts // num_nodes)
+     phy2mlog, phyrank, mlogcnt = replicate_experts(
+         tokens_per_mlog, num_physical_experts // num_nodes)
+
+     # Step 3: pack physical_experts to GPUs
+     # [num_layers * num_nodes, num_physical_experts // num_nodes]
+     tokens_per_phy = (tokens_per_mlog / mlogcnt).gather(-1, phy2mlog)
+     pack_index, rank_in_pack = balanced_packing(tokens_per_phy,
+                                                 num_gpus // num_nodes)
+     phy2pphy = pack_index * phy_experts_per_gpu + rank_in_pack
+     pphy2phy = inverse(phy2pphy)
+
+     pphy2mlog = phy2mlog.gather(
+         -1, pphy2phy)  # [num_layers * num_nodes, num_log_per_nodes]
+     pphy2mlog = (pphy2mlog.view(num_layers, num_nodes, -1) + torch.arange(
+         0,
+         num_logical_experts,
+         num_logical_experts // num_nodes,
+         device=group_pack_index.device,
+     ).view(1, -1, 1)).flatten(-2)
+     pphy2log = mlog2log.gather(-1, pphy2mlog)
+     pphyrank = phyrank.gather(-1, pphy2phy).view(num_layers, -1)
+     logcnt = mlogcnt.view(num_layers, -1).gather(-1, log2mlog)
+     return pphy2log, pphyrank, logcnt
+
+
+ def rebalance_experts(
+     weight: torch.Tensor,
+     num_replicas: int,
+     num_groups: int,
+     num_nodes: int,
+     num_gpus: int,
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+     """
+     Entry point for the expert-parallelism load balancer.
+
+     Parameters:
+         weight: [layers, num_logical_experts], the load statistics for all
+             logical experts
+         num_replicas: number of physical experts, must be a multiple of
+             `num_gpus`
+         num_groups: number of expert groups
+         num_nodes: number of server nodes, where the intra-node network
+             (e.g., NVLink) is faster
+         num_gpus: number of GPUs, must be a multiple of `num_nodes`
+
+     Returns:
+         physical_to_logical_map: [layers, num_replicas], the expert index of
+             each replica
+         logical_to_physical_map: [layers, num_logical_experts, X], the replica
+             indices for each expert
+         expert_count: [layers, num_logical_experts], number of physical
+             replicas for each logical expert
+     """
+     num_layers, num_logical_experts = weight.shape
+     weight = weight.float().cpu()
+     if num_groups % num_nodes == 0:
+         # use the hierarchical load-balance policy
+         phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+             weight, num_replicas, num_groups, num_nodes, num_gpus)
+     else:
+         # use the global load-balance policy
+         phy2log, phyrank, logcnt = rebalance_experts_hierarchical(
+             weight, num_replicas, 1, 1, num_gpus)
+     num_redundant_experts = num_replicas - num_logical_experts
+     maxlogcnt = num_redundant_experts + 1
+     log2phy: torch.Tensor = torch.full(
+         (num_layers, num_logical_experts, maxlogcnt),
+         -1,
+         dtype=torch.int64,
+         device=logcnt.device,
+     )
+     log2phy.view(num_layers, -1).scatter_(
+         -1,
+         phy2log * maxlogcnt + phyrank,
+         torch.arange(num_replicas, dtype=torch.int64,
+                      device=log2phy.device).expand(num_layers, -1),
+     )
+     return phy2log, log2phy, logcnt
+
+
+ # EVOLVE-BLOCK-END
+
+ __all__ = ["rebalance_experts"]
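
> Editor's note: for orientation, a minimal sketch of how `rebalance_experts` can be driven standalone. This is not part of the commit; the load values are illustrative and the arguments are chosen to satisfy the divisibility asserts above.

```python
import torch

from initial_program import rebalance_experts  # run from this directory

# Illustrative load statistics: 2 MoE layers, 8 logical experts each.
weight = torch.randint(1, 100, (2, 8))

# Replicate 8 logical experts into 12 physical slots on 1 node with 4 GPUs,
# using 4 expert groups (8 % 4 == 0 and 12 % 4 == 0, as asserted).
phy2log, log2phy, logcnt = rebalance_experts(
    weight, num_replicas=12, num_groups=4, num_nodes=1, num_gpus=4)

print(phy2log.shape)   # torch.Size([2, 12]) — logical expert id per physical slot
print(logcnt.sum(-1))  # tensor([12, 12]) — every physical slot is assigned
```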
benchmarks/ADRS/llm_sql/evaluator/utils.py ADDED
@@ -0,0 +1,81 @@
+ from concurrent.futures import ThreadPoolExecutor
+ import pandas as pd
+ from typing import Tuple
+
+ class TrieNode:
+     def __init__(self):
+         self.children = {}
+         self.end_of_word = False
+
+
+ class Trie:
+     def __init__(self):
+         self.root = TrieNode()
+
+     def insert(self, word):
+         node = self.root
+         for char in word:
+             if char not in node.children:
+                 node.children[char] = TrieNode()
+             node = node.children[char]
+         node.end_of_word = True
+
+     def longest_common_prefix(self, word):
+         node = self.root
+         common_prefix_length = 0
+         for char in word:
+             if char in node.children:
+                 common_prefix_length += 1  # each trie edge holds one character
+                 node = node.children[char]
+             else:
+                 break
+         return common_prefix_length
+
+ def calculate_length(value):
+     val = 0
+     if isinstance(value, bool):
+         val = 4  # length of 'True' or 'False'
+     elif isinstance(value, (int, float)):
+         val = len(str(value))
+     elif isinstance(value, str):
+         val = len(value)
+     else:
+         val = 0
+     return val**2
+
+ def evaluate_df_prefix_hit_cnt(df: pd.DataFrame) -> Tuple[int, float]:
+     """
+     Evaluate the prefix hit count of a DataFrame
+     """
+
+     def max_overlap(trie, row_string):
+         return min(len(row_string), trie.longest_common_prefix(row_string))
+
+     trie = Trie()
+     total_prefix_hit_count = 0
+     total_string_length = 0
+
+     def process_row(index, row):
+         nonlocal total_string_length
+         row_string = "".join(row.fillna("").astype(str).values)  # No spaces between columns
+         total_string_length += len(row_string)
+         row_prefix_hit_count = max_overlap(trie, row_string)
+         trie.insert(row_string)
+         return row_prefix_hit_count
+
+     # The trie is shared mutable state and each row's hit count depends on the
+     # rows inserted before it, so rows must be processed sequentially.
+     with ThreadPoolExecutor(max_workers=1) as executor:
+         results = executor.map(process_row, df.index, [row for _, row in df.iterrows()])
+
+     total_prefix_hit_count = sum(results)
+     total_prefix_hit_rate = total_prefix_hit_count / total_string_length
+     assert total_prefix_hit_count <= total_string_length
+     print(f"Total string length: {total_string_length}")
+     no_cache_pricing = 2.5 / 5  # per 1M tokens if not cached
+     cache_pricing = 1.25 / 5  # per 1M tokens if cached
+     cached_tokens_pricing = total_prefix_hit_count * cache_pricing / 1e6
+     non_cached_tokens_pricing = (total_string_length - total_prefix_hit_count) * no_cache_pricing / 1e6
+     print(
+         f"Cached tokens pricing = {round(cached_tokens_pricing, 2)}, Non-cached tokens pricing = {round(non_cached_tokens_pricing, 2)}, total pricing = {round(cached_tokens_pricing + non_cached_tokens_pricing, 2)}"
+     )
+     return total_prefix_hit_count, total_prefix_hit_rate * 100
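
> Editor's note: a small, self-contained illustration of the prefix-hit metric (not part of the commit; the DataFrame values are made up). Run from the directory containing `utils.py`:

```python
import pandas as pd

from utils import evaluate_df_prefix_hit_cnt

# Two rows sharing a long common prefix; the second row's prefix should hit
# in the trie built from the first row.
df = pd.DataFrame({
    "name": ["alice", "alice"],
    "dept": ["engineering", "engineering"],
    "note": ["likes rust", "likes go"],
})

hits, hit_rate_pct = evaluate_df_prefix_hit_cnt(df)
# Row strings are "aliceengineeringlikes rust" (26 chars) and
# "aliceengineeringlikes go" (24 chars); their shared prefix is 22 chars,
# so hits == 22 and hit_rate_pct == 44.0 with the sequential executor above.
print(hits, f"{hit_rate_pct:.1f}%")
```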
benchmarks/ADRS/prism/config.yaml ADDED
@@ -0,0 +1,24 @@
+ # Prism (GPU Model Placement) optimization
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
+ language: python
+ diff_based_generation: true
+ max_iterations: 100
+ checkpoint_interval: 5
+ max_solution_length: 60000
+
+ llm:
+   api_base: https://api.openai.com/v1
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   max_tokens: 32000
+   timeout: 600
+
+ prompt:
+   system_message: |-
+     You are an expert in model placement on GPUs. Your task is to improve a model placement algorithm by improving the function named compute_model_placement in the initial program, which places models on the available GPUs.
+     The algorithm must MINIMIZE the maximum KVPR across all GPUs while ensuring the models fit into the GPUs' memory. Note that KVPR is the KV cache pressure of a GPU; it indicates how crowded the GPU is. For a specific GPU, KVPR is computed as sum(model.req_rate/model.slo for model in models) / (GPU_MEM_SIZE - sum(model.model_size for model in models)), where models are the models placed on that GPU. The generated program should be as simple as possible, and the code must execute correctly without errors.
+
+ evaluator:
+   timeout: 360
+
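
> Editor's note: a concrete reading of the KVPR formula in the system prompt, for an 80 GB GPU hosting two hypothetical models (values illustrative, matching the evaluator's `GPU_MEM_SIZE`):

```python
GPU_MEM_SIZE = 80  # GB, as in evaluator.py

# Two hypothetical models on one GPU: (model_size GB, req_rate, slo)
models = [(26, 8, 5), (20, 4, 10)]

pressure = sum(req / slo for _, req, slo in models)           # 8/5 + 4/10 = 2.0
free_mem = GPU_MEM_SIZE - sum(size for size, _, _ in models)  # 80 - 46 = 34
kvpr = pressure / free_mem                                    # 2.0 / 34 ≈ 0.0588
print(round(kvpr, 4))
```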
benchmarks/ADRS/prism/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
+ # that define evaluate(program_path) -> dict, bridging them to the container
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+ COPY . .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/ADRS/prism/evaluator/evaluator.py ADDED
@@ -0,0 +1,259 @@
+ import importlib.util
+ import numpy as np
+ import time
+ import concurrent.futures
+ import traceback
+ from dataclasses import dataclass
+
+ GPU_MEM_SIZE = 80  # GB
+ MIN_INT = float('-inf')  # Define MIN_INT as negative infinity
+
+ @dataclass
+ class Model:
+     model_name: str
+     model_size: int
+     req_rate: int
+     slo: int
+     cur_gpu_id: int
+
+
+ def run_with_timeout(func, args=(), kwargs={}, timeout_seconds=30):
+     """
+     Run a function with a timeout using concurrent.futures.
+     Note: the worker thread is not forcibly killed on timeout; we only stop
+     waiting for its result.
+     """
+     with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
+         future = executor.submit(func, *args, **kwargs)
+         try:
+             result = future.result(timeout=timeout_seconds)
+             return result
+         except concurrent.futures.TimeoutError:
+             raise TimeoutError(f"Function timed out after {timeout_seconds} seconds")
+
+
+ def safe_float(value):
+     """Convert a value to float safely"""
+     try:
+         if np.isnan(value) or np.isinf(value):
+             return 0.0
+         return float(value)
+     except (TypeError, ValueError):
+         return 0.0
+
+ def verify_gpu_mem_constraint(placement_data: dict[int, list[Model]]) -> bool:
+     """
+     Verify whether the placed models fit into GPU memory
+     """
+     # Check if the placement data is valid
+     if placement_data is None:
+         return False
+
+     # Check per-GPU memory usage
+     for gpu_id, models in placement_data.items():
+         if sum(model.model_size for model in models) > GPU_MEM_SIZE:
+             return False
+
+     return True
+
+
+ def calculate_kvcache_pressure(placement_data: dict[int, list[Model]]) -> float:
+     """
+     Calculate the maximum KV cache pressure (KVPR) across GPUs
+     """
+     max_kvpr = MIN_INT
+     for gpu_id, models in placement_data.items():
+         total_model_size = sum(model.model_size for model in models)
+         total_weighted_req_rate = sum(model.req_rate / model.slo for model in models)
+         if GPU_MEM_SIZE - total_model_size > 0:
+             kvpr = total_weighted_req_rate / (GPU_MEM_SIZE - total_model_size)
+         else:
+             kvpr = 1000000
+         max_kvpr = max(max_kvpr, kvpr)
+
+     return max_kvpr
+
+
+ def generate_test_gpu_models(num_tests=50):
+     """
+     Generate multiple test cases with different characteristics
+     """
+     test_cases = []
+     np.random.seed(42)
+
+     for i in range(num_tests):
+         gpu_num = np.random.randint(5, 10)
+         gpu_models = []
+         for j in range(gpu_num * 2):
+             model_size = np.random.randint(10, 30)
+             req_rate = np.random.randint(1, 10)
+             slo = np.random.randint(5, 10)
+             gpu_models.append(Model(model_name=f"model_{j}", model_size=model_size, req_rate=req_rate, slo=slo, cur_gpu_id=j))
+
+         test_cases.append((gpu_num, gpu_models))
+
+     return test_cases
+
+ def evaluate(program_path):
+     """
+     Main evaluation function that tests the placement algorithm
+     on multiple test cases and calculates the composite performance metric.
+     """
+     try:
+         # Load the program
+         spec = importlib.util.spec_from_file_location("program", program_path)
+         program = importlib.util.module_from_spec(spec)
+         spec.loader.exec_module(program)
+
+         # Check if required function exists
+         if not hasattr(program, "compute_model_placement"):
+             return {
+                 "max_kvpr": 0.0,
+                 "success_rate": 0.0,
+                 "combined_score": 0.0,
+                 "error": "Missing compute_model_placement function",
+             }
+
+         # Generate test GPUs and models
+         test_gpu_models = generate_test_gpu_models()
+
+         # Collect metrics across all tests
+         all_kvpr = []
+         all_metrics = []
+         successful_runs = 0
+
+         for i, (gpu_num, gpu_models) in enumerate(test_gpu_models):
+             try:
+                 # Run the algorithm with timeout
+                 start_time = time.time()
+
+                 # Call the program's main function
+                 result = run_with_timeout(
+                     program.compute_model_placement,
+                     kwargs={
+                         'gpu_num': gpu_num,
+                         'models': gpu_models
+                     },
+                     timeout_seconds=10
+                 )
+
+                 execution_time = time.time() - start_time
+
+                 # Validate result format
+                 if not isinstance(result, dict):
+                     return {
+                         "max_kvpr": 0.0,
+                         "success_rate": 0.0,
+                         "combined_score": 0.0,
+                         "error": f"Placement {i}: Expected dict, got {type(result).__name__}",
+                     }
+
+                 # Validate all models are placed
+                 placed_models = []
+                 for gpu_id, assigned_models in result.items():
+                     if not isinstance(assigned_models, list):
+                         return {
+                             "max_kvpr": 0.0,
+                             "success_rate": 0.0,
+                             "combined_score": 0.0,
+                             "error": f"GPU {gpu_id} value must be list, got {type(assigned_models).__name__}",
+                         }
+                     placed_models.extend(assigned_models)
+
+                 if len(placed_models) != len(gpu_models):
+                     return {
+                         "max_kvpr": 0.0,
+                         "success_rate": 0.0,
+                         "combined_score": 0.0,
+                         "error": f"Not all models placed: {len(placed_models)}/{len(gpu_models)}",
+                     }
+
+                 # Check for duplicate placements (by object identity)
+                 placed_ids = [id(m) for m in placed_models]
+                 if len(set(placed_ids)) != len(placed_ids):
+                     return {
+                         "max_kvpr": 0.0,
+                         "success_rate": 0.0,
+                         "combined_score": 0.0,
+                         "error": "Duplicate models detected",
+                     }
+
+                 # Check placed models are the exact input objects
+                 original_ids = {id(m) for m in gpu_models}
+                 if set(placed_ids) != original_ids:
+                     return {
+                         "max_kvpr": 0.0,
+                         "success_rate": 0.0,
+                         "combined_score": 0.0,
+                         "error": "Placed models don't match input models (missing or foreign models)",
+                     }
+
+                 # Verify GPU memory constraints
+                 if not verify_gpu_mem_constraint(result):
+                     return {
+                         "max_kvpr": 0.0,
+                         "success_rate": 0.0,
+                         "combined_score": 0.0,
+                         "error": "GPU memory constraint violated",
+                     }
+
+                 # Calculate metrics for this test case
+                 max_kvpr = calculate_kvcache_pressure(result)
+
+                 # Store metrics
+                 metrics = {
+                     'max_kvpr': safe_float(max_kvpr),
+                     'execution_time': safe_float(execution_time),
+                 }
+
+                 all_kvpr.append(safe_float(max_kvpr))
+                 all_metrics.append(metrics)
+                 successful_runs += 1
+
+             except TimeoutError:
+                 print(f"Placement {i}: Timeout")
+                 continue
+             except Exception as e:
+                 print(f"Placement {i}: Error - {str(e)}")
+                 continue
+
+         # If no successful runs, return minimal scores
+         if successful_runs == 0:
+             return {
+                 "max_kvpr": 0.0,
+                 "success_rate": 0.0,
+                 "combined_score": 0.0,
+                 "error": "All test cases failed"
+             }
+
+         print(all_metrics)
+         # Calculate aggregate metrics
+         avg_kvpr = np.mean(all_kvpr)
+         if avg_kvpr != 0:
+             # Invert so that a lower average pressure yields a higher score
+             avg_kvpr = 1.0 / avg_kvpr
+         avg_execution_time = np.mean([m['execution_time'] for m in all_metrics])
+         success_rate = successful_runs / len(test_gpu_models)
+
+         return {
+             "max_kvpr": safe_float(avg_kvpr),
+             "execution_time": safe_float(avg_execution_time),
+             "success_rate": safe_float(success_rate),
+             "combined_score": safe_float(avg_kvpr) + safe_float(success_rate),
+         }
+
+     except Exception as e:
+         print(f"Evaluation failed: {str(e)}")
+         print(traceback.format_exc())
+         return {
+             "max_kvpr": 0.0,
+             "success_rate": 0.0,
+             "combined_score": 0.0,
+             "error": str(e)
+         }
+
+
+ if __name__ == "__main__":
+     # Backwards-compat: bridges old evaluate() -> dict to the container JSON
+     # protocol. wrapper.py is auto-injected at build time from
+     # skydiscover/evaluation/wrapper.py.
+     from wrapper import run
+
+     run(evaluate)
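
> Editor's note: for reference, a minimal greedy `compute_model_placement` that satisfies this evaluator's contract. A sketch only, not the tuned baseline shipped as `initial_program.py`; it assumes the `Model` dataclass above:

```python
def compute_model_placement(gpu_num, models):
    """Greedy sketch: place each model (largest first) on the GPU whose
    resulting KVPR would be lowest, subject to the 80 GB memory cap."""
    GPU_MEM_SIZE = 80
    placement = {gpu_id: [] for gpu_id in range(gpu_num)}

    def kvpr(ms):
        free = GPU_MEM_SIZE - sum(m.model_size for m in ms)
        if free <= 0:
            return float("inf")
        return sum(m.req_rate / m.slo for m in ms) / free

    for model in sorted(models, key=lambda m: m.model_size, reverse=True):
        # Candidates: GPUs where the model still fits in memory.
        best = min(
            (g for g in placement
             if sum(m.model_size for m in placement[g]) + model.model_size <= GPU_MEM_SIZE),
            key=lambda g: kvpr(placement[g] + [model]),
            default=None,
        )
        if best is None:  # nothing fits; fall back (evaluator will reject this case)
            best = min(placement, key=lambda g: sum(m.model_size for m in placement[g]))
        placement[best].append(model)
    return placement
```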
benchmarks/ADRS/prism/evaluator/requirements.txt ADDED
@@ -0,0 +1 @@
+ numpy
benchmarks/ADRS/prism/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
+ """Backwards-compat wrapper for old Python-based evaluators.
+
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
+ bridges that interface to the container JSON protocol expected by
+ ContainerizedEvaluator.
+
+ Usage — add this to the bottom of your evaluator.py::
+
+     if __name__ == "__main__":
+         from wrapper import run
+         run(evaluate)
+ """
+
+ import json
+ import sys
+ import traceback
+
+
+ def run(evaluate_fn):
+     """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
+
+     * Reads ``sys.argv[1]`` as the program path.
+     * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+       don't contaminate the JSON output.
+     * Separates numeric metrics from non-numeric artifacts.
+     * Guarantees ``combined_score`` is always present in metrics.
+     """
+     if len(sys.argv) < 2:
+         print("Usage: evaluator.py <program_path>", file=sys.stderr)
+         sys.exit(1)
+
+     program_path = sys.argv[1]
+
+     # Redirect stdout → stderr during evaluation so debug prints from
+     # the evaluator don't contaminate the JSON output on stdout.
+     real_stdout = sys.stdout
+     sys.stdout = sys.stderr
+     try:
+         result = evaluate_fn(program_path)
+     except Exception as e:
+         sys.stdout = real_stdout
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": str(e),
+                         "traceback": traceback.format_exc(),
+                     },
+                 }
+             )
+         )
+         return
+     sys.stdout = real_stdout
+
+     if not isinstance(result, dict):
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                     },
+                 }
+             )
+         )
+         return
+
+     # Separate numeric metrics from non-numeric artifacts.
+     metrics = {}
+     artifacts = {}
+     for k, v in result.items():
+         if isinstance(v, bool):
+             metrics[k] = float(v)
+         elif isinstance(v, (int, float)):
+             metrics[k] = float(v)
+         elif isinstance(v, str):
+             artifacts[k] = v
+         elif isinstance(v, (list, dict)):
+             artifacts[k] = json.dumps(v)
+
+     if "combined_score" not in metrics:
+         metrics["combined_score"] = 0.0
+
+     status = "error" if "error" in artifacts else "success"
+     output = {
+         "status": status,
+         "combined_score": metrics["combined_score"],
+         "metrics": metrics,
+     }
+     if artifacts:
+         output["artifacts"] = artifacts
+
+     print(json.dumps(output))
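
> Editor's note: to make the container protocol concrete, here is how a hypothetical old-style evaluator bridged through `run` would behave (the metric and artifact names are illustrative, not from the commit):

```python
from wrapper import run

def evaluate(program_path):
    # Old-style result: numbers become metrics, strings become artifacts.
    return {"combined_score": 0.75, "success_rate": 1.0, "notes": "all cases passed"}

if __name__ == "__main__":
    run(evaluate)
    # Emits one JSON line on stdout, e.g.:
    # {"status": "success", "combined_score": 0.75,
    #  "metrics": {"combined_score": 0.75, "success_rate": 1.0},
    #  "artifacts": {"notes": "all cases passed"}}
```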
benchmarks/arc_benchmark/config.yaml ADDED
@@ -0,0 +1,51 @@
+ # ARC Benchmark base config
+ # This file is used by generate_config.py to inject a task-specific prompt.
+ # Switch models by editing the 'llm' section below.
+
+ # General settings
+ max_iterations: 30
+ checkpoint_interval: 10
+ log_level: "INFO"
+ random_seed: 42
+ diff_based_generation: true
+ max_solution_length: 50000
+
+ # LLM configuration (Option A: GPT-5, default)
+ llm:
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   api_base: "https://api.openai.com/v1"
+   temperature: 0.7
+   # top_p: 0.95  # omitted by default; some providers (e.g. Anthropic) reject both temperature and top_p
+   max_tokens: 32768
+   timeout: 3000
+
+ # Option B: Gemini 3 Pro (comment out Option A and uncomment below)
+ # llm:
+ #   models:
+ #     - name: "gemini-3-pro-preview"
+ #       weight: 1.0
+ #   api_base: "https://generativelanguage.googleapis.com/v1beta/openai/"
+ #   temperature: 0.7
+ #   top_p: 0.95
+ #   max_tokens: 32768
+ #   timeout: 3000
+
+ # Search configuration (default: top-k)
+ search:
+   type: "topk"
+   database:
+     random_seed: 42
+     num_context_programs: 4
+
+ # Prompt configuration
+ # NOTE: generate_config.py overwrites prompt.system_message per task.
+ prompt:
+   system_message: "PLACEHOLDER_REPLACED_BY_GENERATE_CONFIG"
+
+ # Evaluator configuration
+ evaluator:
+   timeout: 360
+   max_retries: 3
+   cascade_evaluation: false
benchmarks/arc_benchmark/convert_arc_agi2_data.py ADDED
@@ -0,0 +1,63 @@
+ #!/usr/bin/env python3
+ """
+ Convert ARC-AGI-2-style data (data/training/*.json, data/evaluation/*.json)
+ into the format expected by this benchmark:
+   - arc-agi_{split}_challenges.json (task_id -> { train, test with inputs only })
+   - arc-agi_{split}_solutions.json  (task_id -> list of test output grids)
+
+ Usage (from benchmarks/arc_benchmark, with data already in ./data/training and ./data/evaluation):
+     OUT_DIR=./data python3 convert_arc_agi2_data.py .
+
+ Or with an external ARC-AGI-2 clone:
+     python3 convert_arc_agi2_data.py /path/to/ARC-AGI-2
+     # Writes into that path by default; set OUT_DIR to write elsewhere.
+ """
+ import json
+ import os
+ import sys
+
+
+ def convert_split(repo_root: str, split: str, out_dir: str) -> None:
+     """Convert data/{split}/*.json into challenges + solutions JSON."""
+     split_dir = os.path.join(repo_root, "data", split)
+     if not os.path.isdir(split_dir):
+         print(f"Skip {split}: no directory {split_dir}")
+         return
+
+     challenges = {}
+     solutions = {}
+
+     for name in sorted(os.listdir(split_dir)):
+         if not name.endswith(".json"):
+             continue
+         task_id = name[:-5]  # strip .json
+         path = os.path.join(split_dir, name)
+         with open(path, "r") as f:
+             task = json.load(f)
+         # Challenge: train as-is; test with only "input" (no output)
+         challenges[task_id] = {
+             "train": task["train"],
+             "test": [{"input": p["input"]} for p in task["test"]],
+         }
+         # Solutions: list of test output grids
+         solutions[task_id] = [p["output"] for p in task["test"]]
+
+     challenges_path = os.path.join(out_dir, f"arc-agi_{split}_challenges.json")
+     solutions_path = os.path.join(out_dir, f"arc-agi_{split}_solutions.json")
+     with open(challenges_path, "w") as f:
+         json.dump(challenges, f)
+     with open(solutions_path, "w") as f:
+         json.dump(solutions, f)
+     print(f"Wrote {challenges_path} ({len(challenges)} tasks)")
+     print(f"Wrote {solutions_path} ({len(solutions)} tasks)")
+
+
+ def main():
+     repo_root = os.path.abspath(sys.argv[1] if len(sys.argv) > 1 else ".")
+     out_dir = os.getenv("OUT_DIR", repo_root)
+     for split in ("training", "evaluation"):
+         convert_split(repo_root, split, out_dir)
+
+
+ if __name__ == "__main__":
+     main()
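
> Editor's note: the conversion is purely structural. For a toy task (grids made up for illustration) it looks like this:

```python
# A raw ARC-AGI-2 task file (e.g. data/training/toy.json):
task = {
    "train": [{"input": [[1, 0]], "output": [[0, 1]]}],
    "test": [{"input": [[2, 0]], "output": [[0, 2]]}],
}

# After convert_split(), the two output files contain:
challenge = {"train": task["train"],
             "test": [{"input": [[2, 0]]}]}  # test outputs stripped
solution = [[[0, 2]]]                        # list of test output grids
```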
benchmarks/arc_benchmark/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
+ # that define evaluate(program_path) -> dict. Bridges them to the container
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+ COPY . .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/arc_benchmark/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/arc_benchmark/evaluator/evaluator.py ADDED
@@ -0,0 +1,407 @@
+ import numpy as np
+ from typing import List, Tuple, Dict, Any
+ import json
+ import os
+ import importlib.util
+
+ try:
+     from skydiscover.evaluation.evaluation_result import EvaluationResult
+ except ImportError:
+     from dataclasses import dataclass, field
+     from typing import Union
+
+     @dataclass
+     class EvaluationResult:
+         metrics: Dict[str, float]
+         artifacts: Dict[str, Union[str, bytes]] = field(default_factory=dict)
+
+ TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
+ TASK_NUM = os.getenv("TASK_NUM", "0")
+ DATA_ROOT = os.getenv("DATA_ROOT", os.path.join(os.path.dirname(os.path.abspath(__file__)), "data"))
+ INCLUDE_TEST = os.getenv("ARC_EVAL_INCLUDE_TEST", "0").lower() in ("1", "true", "yes")
+ USE_TEST_IN_SCORE = os.getenv("ARC_EVAL_USE_TEST_FOR_SCORE", "0").lower() in ("1", "true", "yes")
+
+
+ def cell_accuracy_single(pred: np.ndarray, gt: np.ndarray) -> float:
+     """
+     Compute continuous cell-level accuracy between prediction and ground truth.
+     Returns a float in [0, 1]. Handles shape mismatches gracefully.
+     """
+     if pred.shape != gt.shape:
+         # Partial credit for getting the shape partially right
+         shape_score = 0.0
+         if len(pred.shape) == len(gt.shape) == 2:
+             row_match = 1.0 if pred.shape[0] == gt.shape[0] else 0.0
+             col_match = 1.0 if pred.shape[1] == gt.shape[1] else 0.0
+             shape_score = (row_match + col_match) * 0.1  # up to 0.2 for correct dimensions
+         return shape_score
+     # Cell-level accuracy
+     total_cells = gt.size
+     if total_cells == 0:
+         return 1.0
+     correct_cells = int(np.sum(pred == gt))
+     return correct_cells / total_cells
+
+
+ def best_attempt_cell_accuracy(attempts: List[np.ndarray], gt: np.ndarray) -> float:
+     """Return the best cell accuracy across all attempts for one example."""
+     return max(cell_accuracy_single(a, gt) for a in attempts)
+
+
+ def pass_at_2_accuracy_single(
+     attempts: List[np.ndarray],
+     gt: np.ndarray
+ ) -> Tuple[int, Dict[int, Any]]:
+     """
+     Compute pass@2 accuracy for a single ARC test case.
+
+     Args:
+         attempts: List of 2 numpy arrays representing model attempts.
+         gt: Ground-truth output as a 2D numpy array.
+
+     Returns:
+         pass_at_2: int (1 if any attempt is perfectly correct, else 0)
+         diagnostics: dict mapping attempt index -> diagnostic info.
+             If sizes match, includes indices of incorrect cells.
+     """
+     assert len(attempts) == 2, "Expected exactly 2 attempts for pass@2 evaluation."
+
+     diagnostics = {}
+     passed = False
+
+     for i, pred in enumerate(attempts):
+         attempt_info = {}
+
+         # Size check
+         if pred.shape != gt.shape:
+             attempt_info["size_match"] = False
+             attempt_info["pred_shape"] = list(pred.shape)
+             attempt_info["gt_shape"] = list(gt.shape)
+             attempt_info["incorrect_indices"] = None
+             attempt_info["cell_accuracy"] = 0.0
+             attempt_passed = False
+         else:
+             attempt_info["size_match"] = True
+
+             # Find incorrect cells
+             incorrect_mask = pred != gt
+             incorrect_indices = np.argwhere(incorrect_mask)
+
+             attempt_info["incorrect_indices"] = incorrect_indices.tolist()
+             attempt_info["num_incorrect"] = int(incorrect_mask.sum())
+             attempt_info["num_total"] = int(gt.size)
+             attempt_info["cell_accuracy"] = float(np.sum(~incorrect_mask)) / gt.size
+
+             # Perfect match
+             if incorrect_mask.sum() == 0:
+                 attempt_passed = True
+             else:
+                 attempt_passed = False
+
+         attempt_info["perfect_match"] = attempt_passed
+         passed = attempt_passed or passed
+
+         diagnostics[i] = attempt_info
+
+     pass_at_2 = 1 if passed else 0
+
+     return pass_at_2, diagnostics
+
+ def pass_at_2_accuracy_multi_test(
+     all_attempts: List[List[np.ndarray]],
+     all_gt: List[np.ndarray]
+ ) -> Tuple[List[int], List[Dict[int, Any]]]:
+     """
+     Compute pass@2 accuracy across multiple ARC test cases.
+
+     Args:
+         all_attempts: List of lists of 2 numpy arrays for each test case.
+         all_gt: List of ground-truth outputs as 2D numpy arrays.
+     """
+     assert len(all_attempts) == len(all_gt), "Mismatched number of test cases."
+
+     all_diagnostics = []
+     all_pass = []
+
+     for attempts, gt in zip(all_attempts, all_gt):
+         pass_at_2, diagnostics = pass_at_2_accuracy_single(attempts, gt)
+         all_pass.append(pass_at_2)
+         all_diagnostics.append(diagnostics)
+
+     return all_pass, all_diagnostics
+
+ def extract_failure_artifacts(diagnostics, pred=None, gt=None):
+     """
+     Extract failure artifacts from diagnostics for a given example.
+     Includes actual vs expected output snippets for better LLM feedback.
+     """
+     artifacts = {}
+     if not diagnostics["size_match"]:
+         artifacts["error_type"] = "SizeMismatch"
+         artifacts["error_message"] = (
+             f"Output shape {diagnostics['pred_shape']} does not match "
+             f"expected shape {diagnostics['gt_shape']}."
+         )
+         artifacts["suggestion"] = (
+             f"Your output has shape {diagnostics['pred_shape']} but the correct output "
+             f"has shape {diagnostics['gt_shape']}. Review how you determine output dimensions."
+         )
+     else:
+         num_incorrect = diagnostics['num_incorrect']
+         num_total = diagnostics['num_total']
+         accuracy = diagnostics['cell_accuracy']
+         artifacts["error_type"] = "IncorrectCells"
+         artifacts["error_message"] = (
+             f"{num_incorrect}/{num_total} cells incorrect "
+             f"(cell accuracy: {accuracy:.1%})."
+         )
+         # Show a compact diff of expected vs actual for the first few wrong cells
+         if diagnostics['incorrect_indices'] and pred is not None and gt is not None:
+             wrong = diagnostics['incorrect_indices'][:8]  # first 8 wrong cells
+             diff_lines = []
+             for r, c in wrong:
+                 diff_lines.append(f"  [{r},{c}]: got {int(pred[r, c])}, expected {int(gt[r, c])}")
+             artifacts["cell_diffs"] = "\n".join(diff_lines)
+             if len(diagnostics['incorrect_indices']) > 8:
+                 artifacts["cell_diffs"] += f"\n  ... and {len(diagnostics['incorrect_indices']) - 8} more"
+         artifacts["suggestion"] = (
+             f"Your solution gets {accuracy:.1%} of cells correct. "
+             f"Review the transformation logic for the failing cells."
+         )
+
+     return artifacts
+
+ def evaluate(program_path):
+     """
+     Evaluate the program on ARC task training (and optionally test) examples.
+
+     Returns a combined_score that blends:
+       - pass@2 (binary perfect-match, weighted 0.6)
+       - cell accuracy (continuous partial credit, weighted 0.4)
+     This gives evolution a gradient signal even when no example is solved perfectly.
+     """
+     spec = importlib.util.spec_from_file_location("program_module", program_path)
+     program_module = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(program_module)
+
+     if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'):
+         print("Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
+
+         error_artifacts = {
+             "error_type": "MissingFunction",
+             "error_message": "Stage 1: Program is missing the required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
+             "suggestion": "Make sure your program includes functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take a 2D numpy array as an argument and return a 2D numpy array."
+         }
+
+         return EvaluationResult(
+             metrics={
+                 "runs_successfully": 0.0,
+                 "combined_score": 0.0,
+                 "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions"
+             },
+             artifacts=error_artifacts
+         )
+
+     # Load ARC tasks
+     challenge_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_challenges.json")
+
+     with open(challenge_path, 'r') as f:
+         tasks = json.load(f)
+
+     task_id = list(tasks.keys())[int(TASK_NUM)]
+     task = tasks[task_id]
+
+     train_inputs = [np.array(inp["input"]) for inp in task['train']]
+     train_gts = [np.array(gt["output"]) for gt in task['train']]
+
+     train_attempts = []
+
+     # Generate attempts for training data
+     for inp in train_inputs:
+         attempt_1 = program_module.transform_grid_attempt_1(inp)
+         if not isinstance(attempt_1, np.ndarray):
+             print("transform_grid_attempt_1 did not return a numpy array")
+
+             error_artifacts = {
+                 "error_type": "InvalidReturnType",
+                 "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.",
+                 "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array."
+             }
+
+             return EvaluationResult(
+                 metrics={
+                     "runs_successfully": 0.0,
+                     "combined_score": 0.0,
+                     "error": "transform_grid_attempt_1 did not return a numpy array"
+                 },
+                 artifacts=error_artifacts
+             )
+
+         attempt_2 = program_module.transform_grid_attempt_2(inp)
+         if not isinstance(attempt_2, np.ndarray):
+             print("transform_grid_attempt_2 did not return a numpy array")
+
+             error_artifacts = {
+                 "error_type": "InvalidReturnType",
+                 "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.",
+                 "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array."
+             }
+
+             return EvaluationResult(
+                 metrics={
+                     "runs_successfully": 0.0,
+                     "combined_score": 0.0,
+                     "error": "transform_grid_attempt_2 did not return a numpy array"
+                 },
+                 artifacts=error_artifacts
+             )
+         train_attempts.append([attempt_1, attempt_2])
+
+     pass_at_2_train, train_diagnostics_list = pass_at_2_accuracy_multi_test(train_attempts, train_gts)
+
+     # Compute both binary pass@2 and continuous cell accuracy
+     train_pass_score = sum(pass_at_2_train) / len(pass_at_2_train)
+     train_cell_acc = sum(
+         best_attempt_cell_accuracy(attempts, gt)
+         for attempts, gt in zip(train_attempts, train_gts)
+     ) / len(train_gts)
+
+     # Blended score: pass@2 (60%) + cell accuracy (40%) gives a gradient signal
+     train_score = 0.6 * train_pass_score + 0.4 * train_cell_acc
+
+     metrics = {
+         "runs_successfully": 1.0,
+         "combined_score": train_score,
+         "train_combined_score": train_score,
+         "train_pass_at_2_score": train_pass_score,
+         "train_cell_accuracy": round(train_cell_acc, 4),
+     }
+     error_artifacts = {}
+     for i, (train_pass, train_diagnostics) in enumerate(zip(pass_at_2_train, train_diagnostics_list)):
+         example_name = f"train_example_{i}"
+         metrics[f"{example_name}_pass_at_2"] = train_pass
+         best_acc = best_attempt_cell_accuracy(train_attempts[i], train_gts[i])
+         metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4)
+         for attempt in train_diagnostics:
+             attempt_pass = train_diagnostics[attempt]["perfect_match"]
+             metrics[f"{example_name}_attempt_{attempt}"] = attempt_pass
+             if not attempt_pass:
+                 pred = train_attempts[i][attempt]
+                 gt = train_gts[i]
+                 error_artifacts[f"{example_name}_attempt_{attempt}_diagnostics"] = extract_failure_artifacts(
+                     train_diagnostics[attempt], pred=pred, gt=gt
+                 )
+
+     # Optional: include test feedback (uses solutions if available)
+     if INCLUDE_TEST:
+         solution_path = os.path.join(DATA_ROOT, f"arc-agi_{TASK_FILE}_solutions.json")
+         if os.path.isfile(solution_path):
+             with open(solution_path, 'r') as f:
+                 solutions = json.load(f)
+             solution = solutions.get(task_id)
+             if solution is not None and "test" in task:
+                 if len(task["test"]) != len(solution):
+                     raise ValueError(
+                         f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
+                         f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
+                         f"and arc-agi_{TASK_FILE}_solutions.json were generated together."
+                     )
+                 test_inputs = [np.array(inp["input"]) for inp in task['test']]
+                 test_gts = [np.array(gt) for gt in solution]
+
+                 test_attempts = []
+                 for inp in test_inputs:
+                     attempt_1 = program_module.transform_grid_attempt_1(inp)
+                     if not isinstance(attempt_1, np.ndarray):
+                         print("transform_grid_attempt_1 did not return a numpy array (test)")
+                         return EvaluationResult(
+                             metrics={
+                                 "runs_successfully": 0.0,
+                                 "combined_score": 0.0,
+                                 "error": "transform_grid_attempt_1 did not return a numpy array (test)"
+                             },
+                             artifacts={
+                                 "error_type": "InvalidReturnType",
+                                 "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array (test).",
+                                 "suggestion": "Make sure transform_grid_attempt_1 returns a 2D numpy array."
+                             }
+                         )
+
+                     attempt_2 = program_module.transform_grid_attempt_2(inp)
+                     if not isinstance(attempt_2, np.ndarray):
+                         print("transform_grid_attempt_2 did not return a numpy array (test)")
+                         return EvaluationResult(
+                             metrics={
+                                 "runs_successfully": 0.0,
+                                 "combined_score": 0.0,
+                                 "error": "transform_grid_attempt_2 did not return a numpy array (test)"
+                             },
+                             artifacts={
+                                 "error_type": "InvalidReturnType",
+                                 "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array (test).",
+                                 "suggestion": "Make sure transform_grid_attempt_2 returns a 2D numpy array."
+                             }
+                         )
+                     test_attempts.append([attempt_1, attempt_2])
+
+                 pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)
+                 test_pass_score = sum(pass_at_2_test) / len(pass_at_2_test)
+                 test_cell_acc = sum(
+                     best_attempt_cell_accuracy(attempts, gt)
+                     for attempts, gt in zip(test_attempts, test_gts)
+                 ) / len(test_gts)
+                 test_score = 0.6 * test_pass_score + 0.4 * test_cell_acc
+
+                 metrics["test_combined_score"] = test_score
+                 metrics["test_pass_at_2_score"] = test_pass_score
+                 metrics["test_cell_accuracy"] = round(test_cell_acc, 4)
+                 metrics["test_included"] = 1
+
+                 for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
+                     example_name = f"test_example_{i}"
+                     metrics[f"{example_name}_pass_at_2"] = test_pass
+                     best_acc = best_attempt_cell_accuracy(test_attempts[i], test_gts[i])
+                     metrics[f"{example_name}_cell_accuracy"] = round(best_acc, 4)
+                     for attempt in test_diagnostics:
+                         metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
+                     if test_pass == 0:
+                         first_failing_idx = next(
+                             (a for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
+                             0,
+                         )
+                         pred = test_attempts[i][first_failing_idx]
+                         gt = test_gts[i]
+                         error_artifacts[f"{example_name}"] = extract_failure_artifacts(
+                             test_diagnostics[first_failing_idx], pred=pred, gt=gt
+                         )
+
+                 if USE_TEST_IN_SCORE:
+                     metrics["combined_score"] = (train_score + test_score) / 2.0
+                 else:
+                     metrics["test_included"] = 0
+             else:
+                 metrics["test_included"] = 0
+
+     return EvaluationResult(
+         metrics=metrics,
+         artifacts=error_artifacts
+     )
+
+
+ def _evaluate_as_dict(program_path):
+     """Adapter: calls evaluate() and converts EvaluationResult to a plain dict."""
+     result = evaluate(program_path)
+     d = dict(result.metrics)
+     for k, v in result.artifacts.items():
+         d[k] = v
+     return d
+
+
+ if __name__ == "__main__":
+     # Backwards-compat: bridges old evaluate() -> EvaluationResult to the
+     # container JSON protocol. wrapper.py is copied from
+     # skydiscover/evaluation/wrapper.py.
+     from wrapper import run
+
+     run(_evaluate_as_dict)
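
> Editor's note: a minimal program shape that this evaluator accepts (identity/transpose placeholders for illustration only; real candidates implement the rule inferred from the training pairs):

```python
import numpy as np

def transform_grid_attempt_1(grid: np.ndarray) -> np.ndarray:
    # First strategy goes here; identity is just a placeholder.
    return grid.copy()

def transform_grid_attempt_2(grid: np.ndarray) -> np.ndarray:
    # Second, genuinely different strategy; transpose as a stand-in.
    return grid.T.copy()
```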
benchmarks/arc_benchmark/evaluator/requirements.txt ADDED
@@ -0,0 +1 @@
+ numpy
benchmarks/arc_benchmark/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
+ """Backwards-compat wrapper for old Python-based evaluators.
+
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
+ bridges that interface to the container JSON protocol expected by
+ ContainerizedEvaluator.
+
+ Usage — add this to the bottom of your evaluator.py::
+
+     if __name__ == "__main__":
+         from wrapper import run
+         run(evaluate)
+ """
+
+ import json
+ import sys
+ import traceback
+
+
+ def run(evaluate_fn):
+     """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
+
+     * Reads ``sys.argv[1]`` as the program path.
+     * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+       don't contaminate the JSON output.
+     * Separates numeric metrics from non-numeric artifacts.
+     * Guarantees ``combined_score`` is always present in metrics.
+     """
+     if len(sys.argv) < 2:
+         print("Usage: evaluator.py <program_path>", file=sys.stderr)
+         sys.exit(1)
+
+     program_path = sys.argv[1]
+
+     # Redirect stdout → stderr during evaluation so debug prints from
+     # the evaluator don't contaminate the JSON output on stdout.
+     real_stdout = sys.stdout
+     sys.stdout = sys.stderr
+     try:
+         result = evaluate_fn(program_path)
+     except Exception as e:
+         sys.stdout = real_stdout
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": str(e),
+                         "traceback": traceback.format_exc(),
+                     },
+                 }
+             )
+         )
+         return
+     sys.stdout = real_stdout
+
+     if not isinstance(result, dict):
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                     },
+                 }
+             )
+         )
+         return
+
+     # Separate numeric metrics from non-numeric artifacts.
+     metrics = {}
+     artifacts = {}
+     for k, v in result.items():
+         if isinstance(v, bool):
+             metrics[k] = float(v)
+         elif isinstance(v, (int, float)):
+             metrics[k] = float(v)
+         elif isinstance(v, str):
+             artifacts[k] = v
+         elif isinstance(v, (list, dict)):
+             artifacts[k] = json.dumps(v)
+
+     if "combined_score" not in metrics:
+         metrics["combined_score"] = 0.0
+
+     status = "error" if "error" in artifacts else "success"
+     output = {
+         "status": status,
+         "combined_score": metrics["combined_score"],
+         "metrics": metrics,
+     }
+     if artifacts:
+         output["artifacts"] = artifacts
+
+     print(json.dumps(output))
benchmarks/arc_benchmark/generate_config.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import os
+ import yaml
+ import json
+
+
+ def load_task_as_prompt(task_json, task_num):
+     with open(task_json, 'r') as f:
+         tasks = json.load(f)
+
+     task_id = list(tasks.keys())[int(task_num)]
+     task = tasks[task_id]
+     train_inputs = [inp["input"] for inp in task['train']]
+     train_outputs = [gt["output"] for gt in task['train']]
+
+     train_pairs = ""
+     for i, (inp, out) in enumerate(zip(train_inputs, train_outputs)):
+         train_pairs += f"In {i} - {inp}\nOut {i} - {out}\n"
+
+     prompt = f"""You are participating in a puzzle solving competition. You are an expert at solving puzzles.
+ Find the common pattern that transforms each input grid into its corresponding output grid.
+
+ Your task is to write python functions that implement the MOST GENERAL transformation rule. The rule must:
+ - Apply consistently to ALL training examples
+ - Generalize to unseen inputs (critical for success)
+ - Be based on structural patterns, not memorized examples
+ - Use relative/spatial rules rather than absolute coordinates
+
+ Generalization rules (THIS IS CRITICAL):
+ - Infer the transformation ONLY from the training input-output pairs
+ - If multiple rules fit the training data, choose the SIMPLEST and MOST GENERAL one
+ - Prefer structural/relational rules (shapes, adjacency, symmetry, patterns) over coordinate-based rules
+ - Do NOT hardcode any values, coordinates, or specific grid sizes that appear in training examples
+ - Think: "What is the underlying principle?" not "What fits these specific examples?"
+ - Use numpy only (no external libraries)
+
+ Common failure modes to avoid:
+ - Overfitting to specific grid sizes or positions in training examples
+ - Hardcoding colors, coordinates, or counts from training data
+ - Assuming global properties (like separator colors) without verifying across ALL examples
+ - Using absolute positions when relative/structural rules would generalize better
+
+ Solution approach:
+ - Analyze the training examples to identify the CORE transformation principle
+ - Prefer block-wise, object-wise, or pattern-based rules that work locally
+ - If the grid has distinct regions, solve each region independently
+ - Build flexible rules that adapt to different input sizes and structures
+
+ Training examples:
+ {train_pairs}
+
+ Your task: Write 2 different Python functions that implement the general transformation rule.
+ - Each function takes a 2D numpy array as input and returns the transformed 2D numpy array
+ - The two attempts should use genuinely different strategies (e.g., different algorithmic approaches)
+ - Focus on generalization - your solution will be evaluated on BOTH training examples AND unseen test cases
+
+ CRITICAL: Write general transformations that discover the underlying rule, not memorize the training examples.
+
+ Remember to only output the modified python functions as your solution."""
+
+     return prompt
+
+ def generate_config(task_num, task_file, dataset_root=None, base_config=None):
+     if dataset_root is None:
+         dataset_root = os.getenv("DATA_ROOT")
+     if not dataset_root:
+         dataset_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+     task_json = os.path.join(dataset_root, f"arc-agi_{task_file}_challenges.json")
+     prompt = load_task_as_prompt(task_json, task_num)
+
+     if base_config is None:
+         default_base = os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.yaml")
+         base_config = os.getenv("BASE_CONFIG", default_base)
+     with open(base_config, 'r') as file:
+         config = yaml.safe_load(file)
+
+     config['prompt']['system_message'] = prompt
+     # Use OPENAI_API_KEY at runtime if set (keeps real key out of committed config)
+     api_key_env = os.getenv("OPENAI_API_KEY")
+     if api_key_env and api_key_env.strip() and api_key_env != "your-gemini-api-key":
+         config["llm"]["api_key"] = api_key_env.strip()
+     # Override max_iterations from env if set (e.g. by run_discovery.sh)
+     max_iter_env = os.getenv("MAX_ITERATIONS")
+     if max_iter_env is not None and str(max_iter_env).strip() != "":
+         try:
+             config["max_iterations"] = int(max_iter_env)
+         except ValueError:
+             pass
+
+     # Write to a per-task config file so parallel runs don't conflict
+     out_path = os.getenv("CONFIG_OUT", f"./config_task_{task_num}.yaml")
+     with open(out_path, 'w') as file:
+         yaml.dump(config, file)
+     return out_path
+
+ if __name__ == "__main__":
+     TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
+     TASK_NUM = os.getenv("TASK_NUM", "0")
+
+     path = generate_config(TASK_NUM, TASK_FILE)
+     print(path)
+
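A quick usage sketch for the generator above (assumes the ARC data files exist under `data/`; the task number and iteration count are illustrative):

```python
# Build a per-task config for ARC training task 3 and print where it was written.
import os
from generate_config import generate_config

os.environ["MAX_ITERATIONS"] = "20"  # optional override picked up by generate_config
cfg_path = generate_config(task_num=3, task_file="training")
print(cfg_path)  # e.g. ./config_task_3.yaml
```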
benchmarks/arc_benchmark/initial_program.py ADDED
@@ -0,0 +1,42 @@
+ # EVOLVE-BLOCK-START
+
+ import numpy as np
+
+ def transform_grid_attempt_1(grid):
+     """
+     Example transformation:
+     - Validate input (2D, integer values 0-9).
+     - Rotate the grid 90 degrees clockwise.
+     - Increment every cell by 1 modulo 10 (keeps values 0-9).
+     Returns a new numpy int array.
+     """
+     arr = _validate_grid(grid)
+     out = np.rot90(arr, k=-1)  # 90 degrees clockwise
+     out = (out + 1) % 10
+     return out.astype(np.int32)
+
+ def transform_grid_attempt_2(grid):
+     """
+     Example transformation:
+     - Validate input (2D, integer values 0-9).
+     - Upsample each cell to a 2x2 block (doubling both dimensions).
+     - Invert colors by mapping v -> 9 - v (keeps values 0-9).
+     Returns a new numpy int array.
+     """
+     arr = _validate_grid(grid)
+     out = np.repeat(np.repeat(arr, 2, axis=0), 2, axis=1)
+     out = 9 - out
+     return out.astype(np.int32)
+
+ # EVOLVE-BLOCK-END
+
+ def _validate_grid(grid):
+     arr = np.asarray(grid)
+     if arr.ndim != 2:
+         raise ValueError("Input must be a 2D array.")
+     # cast to integer type for value checks
+     if not np.issubdtype(arr.dtype, np.integer):
+         arr = arr.astype(int)
+     if arr.size and (arr.min() < 0 or arr.max() > 9):
+         raise ValueError("Array values must be integers in the range 0-9.")
+     return arr
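A quick sanity check of the two seed transforms (a sketch, assuming numpy is installed):

```python
import numpy as np

grid = np.array([[0, 1],
                 [2, 3]])
print(transform_grid_attempt_1(grid))  # [[3 1] [4 2]]: rotated clockwise, then +1 mod 10
print(transform_grid_attempt_2(grid))  # 4x4 grid with every value inverted to 9 - v
```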
benchmarks/arc_benchmark/post_discovery_eval.py ADDED
@@ -0,0 +1,157 @@
+ import importlib.util
+ import os
+ import json
+ import numpy as np
+ from evaluator import pass_at_2_accuracy_multi_test, extract_failure_artifacts
+
+ TASK_FILE = os.getenv("ARC_TASK_FILE", "training")
+ TASK_NUM = os.getenv("TASK_NUM", "0")
+ OUTS_DIR = os.getenv("OUTS_DIR", "")
+ # Optional: path to a checkpoint dir (e.g. outputs/evaluation_task_0/checkpoints/checkpoint_10) to eval that best_program.py on test set
+ PROGRAM_DIR = os.getenv("PROGRAM_DIR", "")
+
+
+ def _program_path():
+     """Path to best_program.py: PROGRAM_DIR if set, else OUTS_DIR/best/."""
+     if PROGRAM_DIR:
+         return os.path.join(PROGRAM_DIR, "best_program.py")
+     return os.path.join(OUTS_DIR, "best", "best_program.py")
+
+
+ def _result_path():
+     """Where to write post_evolution_evaluation_result.json."""
+     if PROGRAM_DIR:
+         return os.path.join(PROGRAM_DIR, "post_evolution_evaluation_result.json")
+     return os.path.join(OUTS_DIR, "best", "post_evolution_evaluation_result.json")
+
+
+ def load_program_module():
+     """Dynamically load the best_program.py module from the specified directory."""
+     path = _program_path()
+     if not os.path.isfile(path):
+         raise FileNotFoundError(f"Program not found: {path}. Set PROGRAM_DIR to a checkpoint dir (e.g. .../checkpoints/checkpoint_10) or ensure OUTS_DIR/best/best_program.py exists.")
+     spec = importlib.util.spec_from_file_location("program_module", path)
+     program_module = importlib.util.module_from_spec(spec)
+     spec.loader.exec_module(program_module)
+
+     return program_module
+
+ def evaluate():
+     """Evaluate the program module located in the specified directory."""
+     program_module = load_program_module()
+     if not hasattr(program_module, 'transform_grid_attempt_1') or not hasattr(program_module, 'transform_grid_attempt_2'):
+         print("Stage 1 validation failed: Program must define 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.")
+
+         error_artifacts = {
+             "error_type": "MissingFunction",
+             "error_message": "Stage 1: Program is missing required 'transform_grid_attempt_1' and 'transform_grid_attempt_2' functions.",
+             "suggestion": "Make sure your program includes functions named 'transform_grid_attempt_1' and 'transform_grid_attempt_2' that take a 2D numpy array as an argument and return a 2D numpy array."
+         }
+
+         return dict(
+             metrics={
+                 "runs_successfully": 0.0,
+                 "combined_score": 0.0,
+                 "error": "Missing transform_grid_attempt_1 and transform_grid_attempt_2 functions"
+             },
+             artifacts=error_artifacts
+         )
+     # Load ARC tasks
+     data_root = os.getenv("DATA_ROOT")
+     if not data_root:
+         data_root = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
+     challenge_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_challenges.json")
+     solution_path = os.path.join(data_root, f"arc-agi_{TASK_FILE}_solutions.json")
+
+     with open(challenge_path, 'r') as f:
+         tasks = json.load(f)
+     with open(solution_path, 'r') as f:
+         solutions = json.load(f)
+
+     task_id = list(tasks.keys())[int(TASK_NUM)]
+     solution = solutions[task_id]
+     task = tasks[task_id]
+
+     # Sanity check: test inputs and solutions must align (same task, same order)
+     if len(task["test"]) != len(solution):
+         raise ValueError(
+             f"Train/test data mismatch: task {task_id} has {len(task['test'])} test inputs "
+             f"but {len(solution)} solution outputs. Check that arc-agi_{TASK_FILE}_challenges.json "
+             f"and arc-agi_{TASK_FILE}_solutions.json were generated together (convert_arc_agi2_data.py)."
+         )
+
+     test_inputs = [np.array(inp["input"]) for inp in task['test']]
+     test_gts = [np.array(gt) for gt in solution]
+
+     test_attempts = []
+     for inp in test_inputs:
+         attempt_1 = program_module.transform_grid_attempt_1(inp)
+         if not isinstance(attempt_1, np.ndarray):
+             print("transform_grid_attempt_1 did not return a numpy array")
+
+             error_artifacts = {
+                 "error_type": "InvalidReturnType",
+                 "error_message": "Stage 1: transform_grid_attempt_1 did not return a numpy array.",
+                 "suggestion": "Make sure your transform_grid_attempt_1 function returns a 2D numpy array."
+             }
+
+             return dict(
+                 metrics={
+                     "runs_successfully": 0.0,
+                     "combined_score": 0.0,
+                     "error": "transform_grid_attempt_1 did not return a numpy array"
+                 },
+                 artifacts=error_artifacts
+             )
+
+         attempt_2 = program_module.transform_grid_attempt_2(inp)
+         if not isinstance(attempt_2, np.ndarray):
+             print("transform_grid_attempt_2 did not return a numpy array")
+
+             error_artifacts = {
+                 "error_type": "InvalidReturnType",
+                 "error_message": "Stage 1: transform_grid_attempt_2 did not return a numpy array.",
+                 "suggestion": "Make sure your transform_grid_attempt_2 function returns a 2D numpy array."
+             }
+
+             return dict(
+                 metrics={
+                     "runs_successfully": 0.0,
+                     "combined_score": 0.0,
+                     "error": "transform_grid_attempt_2 did not return a numpy array"
+                 },
+                 artifacts=error_artifacts
+             )
+         test_attempts.append([attempt_1, attempt_2])
+
+     pass_at_2_test, test_diagnostics_list = pass_at_2_accuracy_multi_test(test_attempts, test_gts)
+     metrics = {
+         "runs_successfully": 1.0,
+         "combined_score": sum(pass_at_2_test) / len(pass_at_2_test),
+     }
+     error_artifacts = {}
+     for i, (test_pass, test_diagnostics) in enumerate(zip(pass_at_2_test, test_diagnostics_list)):
+         example_name = f"test_example_{i}"
+         metrics[f"{example_name}_pass_at_2"] = test_pass
+         for attempt in test_diagnostics:
+             metrics[f"{example_name}_attempt_{attempt}"] = test_diagnostics[attempt]["perfect_match"]
+         if test_pass == 0:
+             # test_diagnostics is {0: {...}, 1: {...}}; extract_failure_artifacts expects one attempt's dict
+             first_failing = next(
+                 (test_diagnostics[a] for a in test_diagnostics if not test_diagnostics[a]["perfect_match"]),
+                 test_diagnostics[0],
+             )
+             error_artifacts[f"{example_name}"] = extract_failure_artifacts(first_failing)
+
+     return dict(
+         metrics=metrics,
+         artifacts=error_artifacts
+     )
+
+ if __name__ == "__main__":
+     evaluation_result = evaluate()
+     result_path = _result_path()
+     os.makedirs(os.path.dirname(result_path), exist_ok=True)
+     with open(result_path, 'w') as f:
+         json.dump(evaluation_result, f, indent=4)
+     print(f"Test-set evaluation written to {result_path}")
benchmarks/frontier-cs-eval/README.md ADDED
@@ -0,0 +1,72 @@
+ # Frontier-CS Benchmark
+
+ Evolves C++ solutions for [Frontier-CS](https://github.com/facebookresearch/Frontier-CS) algorithmic optimization problems using SkyDiscover.
+
+ ## Setup
+
+ ```bash
+ # 1. Clone Frontier-CS
+ cd benchmarks/frontier-cs-eval
+ git clone https://github.com/FrontierCS/Frontier-CS.git
+
+ # 2. Start the judge server (requires Docker)
+ cd Frontier-CS/algorithmic
+ docker compose up -d
+
+ # 3. Install dependencies (from project root)
+ cd ../../..
+ uv sync --extra frontier-cs
+
+ # 4. Set your API key
+ export OPENAI_API_KEY=...
+ ```
+
+ ## Run
+
+ Supported algorithms: `adaevolve`, `evox`, `openevolve`, `gepa`, `shinkaevolve`
+
+ Single problem:
+ ```bash
+ cd benchmarks/frontier-cs-eval
+ FRONTIER_CS_PROBLEM=0 uv run skydiscover-run initial_program.cpp evaluator.py \
+     -c config.yaml -s [search_algorithm] -i 50
+ ```
+
+ All problems in parallel:
+ ```bash
+ uv run python run_all_frontiercs.py --search [search_algorithm] --iterations 50 --workers 6
+ ```
+
+ ## Evaluate best programs (post-discovery)
+
+ ```bash
+ uv run python run_best_programs_frontiercs.py
+ ```
+
+ ## Analyze results
+
+ ```bash
+ uv run python combine_results.py   # merge training/testing scores into CSV
+ uv run python analyze_results.py   # generate plots and statistics
+ ```
+
+ ## Files
+
+ | File | Description |
+ |------|-------------|
+ | `initial_program.cpp` | Seed C++ program |
+ | `evaluator.py` | Evaluates C++ solutions via Frontier-CS docker judge |
+ | `config.yaml` | Config with system prompt template |
+ | `run_all_frontiercs.py` | Parallelizes evolution across all problems |
+ | `run_best_programs_frontiercs.py` | Re-evaluates best programs after evolution |
+ | `combine_results.py` | Combines training/testing scores into CSV |
+ | `analyze_results.py` | Generates score analysis plots and statistics |
+
+ ## Environment variables
+
+ | Variable | Default | Description |
+ |----------|---------|-------------|
+ | `OPENAI_API_KEY` | (required) | API key |
+ | `FRONTIER_CS_PROBLEM` | `0` | Problem ID to evolve |
+ | `JUDGE_URLS` | `http://localhost:8081` | Comma-separated judge server URLs |
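To spread evaluation load across several judge servers, set `JUDGE_URLS` to a comma-separated list; `evaluator.py` then picks one at random per call. A sketch of that selection logic (the second URL is illustrative):

```python
import os, random

os.environ["JUDGE_URLS"] = "http://localhost:8081,http://localhost:8082"
urls = [u.strip() for u in os.environ["JUDGE_URLS"].split(",") if u.strip()]
print(random.choice(urls))  # one judge chosen per evaluation call
```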
benchmarks/frontier-cs-eval/analyze_results.py ADDED
@@ -0,0 +1,105 @@
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ from pathlib import Path
+
+ # Define paths
+ _script_dir = str(Path(__file__).resolve().parent)
+ input_csv = str(Path(_script_dir) / "combined_results.csv")
+ output_dir = _script_dir
+
+ # Read the CSV file
+ df = pd.read_csv(input_csv)
+
+ # Calculate average of training and testing scores
+ df['average_score'] = (df['training_score'] + df['testing_score']) / 2
+
+ # Remove rows where either score is None (NaN)
+ df_complete = df.dropna(subset=['training_score', 'testing_score'])
+
+ print("\n=== Analysis Results ===")
+ print(f"Total problems: {len(df)}")
+ print(f"Problems with complete data: {len(df_complete)}")
+ print("\nTraining Scores:")
+ print(f"  Mean: {df_complete['training_score'].mean():.4f}")
+ print(f"  Median: {df_complete['training_score'].median():.4f}")
+ print(f"  Std Dev: {df_complete['training_score'].std():.4f}")
+ print(f"  Min: {df_complete['training_score'].min():.4f}")
+ print(f"  Max: {df_complete['training_score'].max():.4f}")
+
+ print("\nTesting Scores:")
+ print(f"  Mean: {df_complete['testing_score'].mean():.4f}")
+ print(f"  Median: {df_complete['testing_score'].median():.4f}")
+ print(f"  Std Dev: {df_complete['testing_score'].std():.4f}")
+ print(f"  Min: {df_complete['testing_score'].min():.4f}")
+ print(f"  Max: {df_complete['testing_score'].max():.4f}")
+
+ print("\nAverage Scores:")
+ print(f"  Mean: {df_complete['average_score'].mean():.4f}")
+ print(f"  Median: {df_complete['average_score'].median():.4f}")
+ print(f"  Std Dev: {df_complete['average_score'].std():.4f}")
+
+ # Save the updated CSV with averages
+ output_csv = Path(output_dir) / "combined_results_with_averages.csv"
+ df.to_csv(output_csv, index=False)
+ print(f"\nUpdated CSV with averages saved to {output_csv}")
+
+ # Create visualizations
+ fig, axes = plt.subplots(2, 2, figsize=(14, 10))
+
+ # 1. Scatter plot: Training vs Testing scores
+ ax = axes[0, 0]
+ ax.scatter(df_complete['training_score'], df_complete['testing_score'], alpha=0.6, s=50)
+ # Add diagonal line for reference (where training == testing)
+ lim = [min(df_complete['training_score'].min(), df_complete['testing_score'].min()),
+        max(df_complete['training_score'].max(), df_complete['testing_score'].max())]
+ ax.plot(lim, lim, 'r--', alpha=0.5, label='Training = Testing')
+ ax.set_xlabel('Training Score')
+ ax.set_ylabel('Testing Score')
+ ax.set_title('Training vs Testing Scores')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ # 2. Distribution comparison - histograms
+ ax = axes[0, 1]
+ ax.hist(df_complete['training_score'], bins=20, alpha=0.6, label='Training', edgecolor='black')
+ ax.hist(df_complete['testing_score'], bins=20, alpha=0.6, label='Testing', edgecolor='black')
+ ax.set_xlabel('Score')
+ ax.set_ylabel('Frequency')
+ ax.set_title('Distribution of Training vs Testing Scores')
+ ax.legend()
+ ax.grid(True, alpha=0.3, axis='y')
+
+ # 3. Box plot comparison
+ ax = axes[1, 0]
+ box_data = [df_complete['training_score'], df_complete['testing_score'], df_complete['average_score']]
+ bp = ax.boxplot(box_data, labels=['Training', 'Testing', 'Average'])
+ ax.set_ylabel('Score')
+ ax.set_title('Score Comparison (Box Plot)')
+ ax.grid(True, alpha=0.3, axis='y')
+
+ # 4. Difference plot: Training - Testing
+ ax = axes[1, 1]
+ difference = df_complete['training_score'] - df_complete['testing_score']
+ ax.scatter(df_complete['problem_id'].astype(int), difference, alpha=0.6, s=50)
+ ax.axhline(y=0, color='r', linestyle='--', alpha=0.5, label='No Difference')
+ ax.set_xlabel('Problem ID')
+ ax.set_ylabel('Training Score - Testing Score')
+ ax.set_title('Score Difference (Training - Testing)')
+ ax.legend()
+ ax.grid(True, alpha=0.3)
+
+ plt.tight_layout()
+ plot_path = Path(output_dir) / "results_analysis.png"
+ plt.savefig(plot_path, dpi=300, bbox_inches='tight')
+ print(f"Plot saved to {plot_path}")
+
+ # Additional statistics about differences
+ print("\nScore Differences (Training - Testing):")
+ print(f"  Mean Difference: {difference.mean():.4f}")
+ print(f"  Median Difference: {difference.median():.4f}")
+ print(f"  Std Dev: {difference.std():.4f}")
+ print(f"  Problems where training > testing: {(difference > 0).sum()}")
+ print(f"  Problems where testing > training: {(difference < 0).sum()}")
+
+ plt.show()
benchmarks/frontier-cs-eval/combine_results.py ADDED
@@ -0,0 +1,66 @@
+ import json
+ import csv
+ import os
+ from pathlib import Path
+
+ # Define paths
+ _script_dir = Path(__file__).resolve().parent
+ _repo_root = _script_dir.parent.parent
+ training_dir = str(_repo_root / "outputs" / "frontier_cs")
+ testing_dir = str(_script_dir / "evaluation_results")
+ output_csv = str(_script_dir / "combined_results.csv")
+
+ # Collect all problems
+ results = []
+
+ # Get all problem directories from training data
+ training_problems = sorted([d for d in os.listdir(training_dir) if d.startswith("problem_")])
+
+ print(f"Found {len(training_problems)} training problems")
+
+ for problem_dir in training_problems:
+     problem_id = problem_dir.replace("problem_", "")
+
+     # Get training score from best_program_info.json
+     training_score = None
+     training_info_path = os.path.join(training_dir, problem_dir, "best", "best_program_info.json")
+
+     if os.path.exists(training_info_path):
+         try:
+             with open(training_info_path, 'r') as f:
+                 training_data = json.load(f)
+             training_score = training_data.get("metrics", {}).get("combined_score")
+         except Exception as e:
+             print(f"Error reading training data for problem {problem_id}: {e}")
+
+     # Get testing score from evaluation_results json
+     testing_score = None
+     testing_json_path = os.path.join(testing_dir, f"problem_{problem_id}.json")
+
+     if os.path.exists(testing_json_path):
+         try:
+             with open(testing_json_path, 'r') as f:
+                 testing_data = json.load(f)
+             testing_score = testing_data.get("combined_score")
+         except Exception as e:
+             print(f"Error reading testing data for problem {problem_id}: {e}")
+
+     results.append({
+         "problem_id": problem_id,
+         "training_score": training_score,
+         "testing_score": testing_score
+     })
+
+ # Write to CSV
+ with open(output_csv, 'w', newline='') as csvfile:
+     fieldnames = ["problem_id", "training_score", "testing_score"]
+     writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
+
+     writer.writeheader()
+     writer.writerows(results)
+
+ print(f"\nResults written to {output_csv}")
+ print(f"Total problems: {len(results)}")
+ print(f"Problems with both scores: {sum(1 for r in results if r['training_score'] is not None and r['testing_score'] is not None)}")
+ print(f"Problems missing training score: {sum(1 for r in results if r['training_score'] is None)}")
+ print(f"Problems missing testing score: {sum(1 for r in results if r['testing_score'] is None)}")
benchmarks/frontier-cs-eval/config.yaml ADDED
@@ -0,0 +1,57 @@
+ # Frontier-CS Benchmark
+ # Usage: uv run skydiscover-run initial_program.cpp evaluator.py -c config.yaml -s <strategy> -i 50
+
+ max_iterations: 100
+ checkpoint_interval: 10
+ log_level: INFO
+
+ llm:
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   api_base: https://api.openai.com/v1
+   temperature: 0.7
+   # top_p: 0.95  # omitted by default; some providers (e.g. Anthropic) reject both temperature and top_p
+   max_tokens: 32000
+   timeout: 600
+   # To use Gemini: override with --model gemini-3-pro-preview
+
+ prompt:
+   system_message: |
+     You are an expert competitive programmer specializing in algorithmic optimization.
+
+     PROBLEM STATEMENT:
+     {problem_statement}
+
+     CONSTRAINTS:
+     {problem_constraints}
+
+     OBJECTIVE: Maximize the score returned by the Frontier-CS judge (higher is better).
+     Your solution must be valid C++ code that compiles and runs correctly.
+
+     KEY STRATEGIES:
+     - Analyze the problem structure carefully before coding
+     - Consider time and space complexity constraints
+     - Use efficient data structures (vectors, maps, sets, priority queues)
+     - Implement clean, well-structured code
+     - Handle edge cases properly
+     - Optimize hot loops and critical sections
+
+     COMMON TECHNIQUES:
+     - Dynamic programming for optimization problems
+     - Greedy algorithms with proper ordering
+     - Graph algorithms (BFS, DFS, shortest paths)
+     - Binary search for monotonic functions
+     - Divide and conquer approaches
+     - Heuristic search (simulated annealing, genetic algorithms, local search)
+
+     OUTPUT: Complete C++ program with main() function that reads from stdin and writes to stdout.
+
+ evaluator:
+   timeout: 300
+   max_retries: 3
+   cascade_evaluation: false
+
+ diff_based_generation: true
+ max_solution_length: 50000
+ random_seed: 42
benchmarks/frontier-cs-eval/evaluator.py ADDED
@@ -0,0 +1,174 @@
+ """
+ Evaluator for Frontier-CS algorithmic problems.
+
+ This evaluator integrates with SkyDiscover to evaluate generated C++ solutions
+ against Frontier-CS benchmark problems using the local judge server.
+ """
+
+ import traceback
+ from pathlib import Path
+ import logging
+ import sys
+ import os
+ import random
+
+ logger = logging.getLogger(__name__)
+
+ # Support multiple judge servers for load balancing
+ DEFAULT_JUDGE_URL = "http://localhost:8081"
+ JUDGE_URLS = os.environ.get("JUDGE_URLS", DEFAULT_JUDGE_URL).split(",")
+ JUDGE_URLS = [url.strip() for url in JUDGE_URLS if url.strip()]
+
+ def get_judge_url() -> str:
+     """Get a judge URL using random selection for load balancing."""
+     return random.choice(JUDGE_URLS)
+
+ # Add Frontier-CS to path
+ frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src"
+ if str(frontier_cs_path) not in sys.path:
+     sys.path.insert(0, str(frontier_cs_path))
+
+ try:
+     from frontier_cs.single_evaluator import SingleEvaluator as FrontierCSEvaluator
+     from frontier_cs.runner.base import EvaluationStatus
+ except ImportError as e:
+     logger.error(f"Failed to import Frontier-CS: {e}")
+     logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS")
+     raise
+
+ def evaluate(program_path: str, problem_id: str = None, **kwargs) -> dict:
+     """
+     Evaluate a C++ solution for a Frontier-CS algorithmic problem.
+
+     Args:
+         program_path: Path to the C++ solution file
+         problem_id: Frontier-CS problem ID (e.g., "0", "1", "2", etc.)
+             If None, will be read from FRONTIER_CS_PROBLEM env var or config
+
+     Returns:
+         dict with evaluation results:
+             - combined_score: The score from the judge (higher is better)
+             - runs_successfully: 1.0 if evaluation succeeded, 0.0 otherwise
+             - status: Evaluation status string
+             - message: Any error or status messages
+             - problem_id: The problem ID
+             - program_path: Path to the evaluated program
+             - score_unbounded: Unbounded score if available
+             - metadata: Additional evaluation metadata
+     """
+     # Get problem_id from parameter, environment, or kwargs
+     if problem_id is None:
+         problem_id = os.environ.get('FRONTIER_CS_PROBLEM')
+         if problem_id is None:
+             problem_id = kwargs.get('frontier_cs_problem', '0')
+
+     logger.info(f"Evaluating program {program_path} for Frontier-CS problem {problem_id}")
+
+     try:
+         # Initialize evaluator with judge server (load balanced if multiple configured)
+         judge_url = get_judge_url()
+         logger.info(f"Using judge server: {judge_url}")
+         evaluator = FrontierCSEvaluator(
+             backend="docker",
+             judge_url=judge_url,
+             register_cleanup=False,
+         )
+
+         # Read the solution code
+         solution_path = Path(program_path)
+         if not solution_path.exists():
+             error_msg = f"Solution file not found: {program_path}"
+             logger.error(error_msg)
+             return {
+                 "combined_score": 0.0,
+                 "runs_successfully": 0.0,
+                 "status": "error",
+                 "message": error_msg,
+                 "problem_id": problem_id,
+                 "program_path": program_path,
+             }
+
+         # Extract code and remove any EVOLVE-BLOCK markers
+         code = solution_path.read_text().replace(
+             "// EVOLVE-BLOCK-START", ""
+         ).replace(
+             "// EVOLVE-BLOCK-END", ""
+         ).strip()
+
+         logger.info(f"Code extracted from {program_path}")
+
+         # Evaluate the solution
+         result = evaluator.evaluate(
+             track="algorithmic",
+             problem_id=problem_id,
+             code=code,
+             backend="docker",
+         )
+
+         logger.info(f"Evaluation completed with status: {result.status}")
+
+         # Process result
+         if result.status == EvaluationStatus.SUCCESS:
+             print(result)
+             score = result.score
+             # Use unbounded score for optimization (allows >100 if beating reference)
+             score_unbounded = result.metadata.get('scoreUnbounded', score) if result.metadata else score
+             print(f"score={score}, score_unbounded={score_unbounded}")
+
+             # Extract only essential metadata (exclude large test case outputs)
+             essential_metadata = {}
+             if result.metadata:
+                 essential_metadata = {
+                     "status": result.metadata.get("status"),
+                     "passed": result.metadata.get("passed"),
+                     "result": result.metadata.get("result"),
+                     "score": result.metadata.get("score"),
+                     "scoreUnbounded": result.metadata.get("scoreUnbounded"),
+                 }
+
+             return {
+                 "combined_score": float(score),  # Ensure it's a float
+                 "score_unbounded": score_unbounded,
+                 "runs_successfully": 1.0,
+                 "status": "success",
+                 "message": result.message or "Evaluation successful",
+                 "problem_id": problem_id,
+                 "program_path": program_path,
+                 "duration_seconds": result.duration_seconds,
+                 "metadata": essential_metadata,
+             }
+         elif result.status == EvaluationStatus.TIMEOUT:
+             logger.warning(f"Evaluation timed out: {result.message}")
+             return {
+                 "combined_score": 0.0,
+                 "runs_successfully": 0.0,
+                 "status": "timeout",
+                 "message": result.message or "Evaluation timed out",
+                 "problem_id": problem_id,
+                 "program_path": program_path,
+             }
+         else:  # ERROR status
+             logger.error(f"Evaluation error: {result.message}")
+             return {
+                 "combined_score": 0.0,
+                 "runs_successfully": 0.0,
+                 "status": "error",
+                 "message": result.message or "Evaluation failed",
+                 "problem_id": problem_id,
+                 "program_path": program_path,
+                 "logs": result.logs,
+             }
+
+     except Exception as e:
+         logger.error(f"Evaluation failed completely: {str(e)}")
+         logger.error(traceback.format_exc())
+         return {
+             "combined_score": 0.0,
+             "runs_successfully": 0.0,
+             "status": "error",
+             "message": str(e),
+             "problem_id": problem_id,
+             "program_path": program_path,
+             "error": str(e),
+         }
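A direct-call sketch for the evaluator above (assumes the Docker judge server from the README is running on localhost:8081):

```python
if __name__ == "__main__":
    # Evaluate the seed program against problem 0 and print the judge's verdict.
    result = evaluate("initial_program.cpp", problem_id="0")
    print(result["status"], result["combined_score"])
```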
benchmarks/frontier-cs-eval/initial_program.cpp ADDED
@@ -0,0 +1,6 @@
+ #include <bits/stdc++.h>
+ using namespace std;
+ int main(){
+     std::cout << "Hello, World!" << std::endl;
+     return 0;
+ }
benchmarks/frontier-cs-eval/run_all_frontiercs.py ADDED
@@ -0,0 +1,70 @@
+ import argparse
+ import os
+ import sys
+ import subprocess
+ from pathlib import Path
+ from concurrent.futures import ProcessPoolExecutor
+
+ from dotenv import load_dotenv
+ load_dotenv()
+
+ SCRIPT_DIR = Path(__file__).resolve().parent
+
+ frontier_cs_path = SCRIPT_DIR / "Frontier-CS" / "src"
+ if str(frontier_cs_path) not in sys.path:
+     sys.path.insert(0, str(frontier_cs_path))
+
+ from frontier_cs.runner.algorithmic_local import AlgorithmicLocalRunner
+
+
+ def run_single_problem(args):
+     p_id, search, iterations, env = args
+     print(f"\n[START] Problem ID: {p_id}")
+     command = [
+         "uv", "run", "skydiscover-run",
+         "initial_program.cpp", "evaluator.py",
+         "-c", "config.yaml",
+         "-s", search,
+         "-i", str(iterations),
+         "-o", f"outputs/frontier_cs/problem_{p_id}",
+     ]
+     env = {**env, "FRONTIER_CS_PROBLEM": str(p_id)}
+     try:
+         subprocess.run(command, check=True, env=env, cwd=str(SCRIPT_DIR))
+         return f"✅ Problem {p_id} completed."
+     except subprocess.CalledProcessError as e:
+         return f"❌ Problem {p_id} failed: {e}"
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Run SkyDiscover on all Frontier-CS problems")
+     parser.add_argument("--search", "-s", default="adaevolve",
+                         help="Search algorithm (default: adaevolve)")
+     parser.add_argument("--iterations", "-i", type=int, default=50,
+                         help="Iterations per problem (default: 50)")
+     parser.add_argument("--workers", "-w", type=int, default=6,
+                         help="Parallel workers (default: 6)")
+     args = parser.parse_args()
+
+     runner = AlgorithmicLocalRunner()
+     problems_data = runner.list_problems()
+     problem_ids = sorted([p['id'] for p in problems_data['problems']], key=int)
+
+     print(f"Running {len(problem_ids)} problems with {args.workers} workers "
+           f"(search={args.search}, iterations={args.iterations})...")
+
+     env = os.environ.copy()
+     task_args = [(p_id, args.search, args.iterations, env) for p_id in problem_ids]
+
+     with ProcessPoolExecutor(max_workers=args.workers) as executor:
+         results = list(executor.map(run_single_problem, task_args))
+
+     print("\n" + "=" * 30)
+     print("ALL RUNS COMPLETE")
+     print("=" * 30)
+     for result in results:
+         print(result)
+
+
+ if __name__ == "__main__":
+     main()
benchmarks/frontier-cs-eval/run_best_programs_frontiercs.py ADDED
@@ -0,0 +1,404 @@
+ import os
+ import sys
+ import json
+ import logging
+ import threading
+ from pathlib import Path
+ from typing import Dict, List, Tuple
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ # Add Frontier-CS to path
+ frontier_cs_path = Path(__file__).resolve().parent / "Frontier-CS" / "src"
+ if str(frontier_cs_path) not in sys.path:
+     sys.path.insert(0, str(frontier_cs_path))
+
+ try:
+     from frontier_cs.evaluator import FrontierCSEvaluator
+     from frontier_cs.runner.base import EvaluationStatus
+ except ImportError as e:
+     logger.error(f"Failed to import Frontier-CS: {e}")
+     logger.error("Please ensure Frontier-CS is installed as a submodule in benchmarks/frontier-cs-eval/Frontier-CS")
+     sys.exit(1)
+
+
+ class BestProgramEvaluator:
+     """Evaluates all best_program.cpp files in the outputs directory."""
+
+     def __init__(self, outputs_dir: str, judge_url: str = "http://localhost:8081", num_workers: int = 8):
+         """
+         Initialize the evaluator.
+
+         Args:
+             outputs_dir: Path to the outputs directory containing problem folders
+             judge_url: URL of the judge server
+             num_workers: Number of parallel workers for evaluation
+         """
+         self.outputs_dir = Path(outputs_dir)
+         self.judge_url = judge_url
+         self.num_workers = num_workers
+
+         # Use thread-local storage for evaluator instances (avoid race condition)
+         self._evaluator_local = threading.local()
+
+         self.results = []
+
+         # Create results directory in the script's directory
+         self.results_dir = Path(__file__).resolve().parent / "evaluation_results"
+         self.results_dir.mkdir(exist_ok=True)
+         logger.info(f"Results will be saved to {self.results_dir}")
+         logger.info(f"Configured {self.num_workers} workers with thread-local evaluators (the current run loop evaluates sequentially)")
+
+     def _get_evaluator(self) -> 'FrontierCSEvaluator':
+         """
+         Get the evaluator for the current thread.
+         Creates a new instance if this thread hasn't created one yet.
+         This avoids race conditions from sharing a single evaluator across threads.
+         """
+         if not hasattr(self._evaluator_local, 'evaluator'):
+             self._evaluator_local.evaluator = FrontierCSEvaluator(
+                 backend="docker",
+                 judge_url=self.judge_url,
+             )
+             logger.debug(f"Created new evaluator for thread {threading.current_thread().name}")
+         return self._evaluator_local.evaluator
+
+     def find_best_programs(self) -> Dict[str, Path]:
+         """
+         Find all best_program.cpp files in the outputs directory.
+
+         Returns:
+             Dict mapping problem_id to best_program.cpp path
+         """
+         best_programs = {}
+
+         # Look for frontier_cs subdirectory
+         frontier_cs_dir = self.outputs_dir / "frontier_cs"
+         if not frontier_cs_dir.exists():
+             logger.error(f"frontier_cs directory not found at {frontier_cs_dir}")
+             return best_programs
+
+         # Iterate through problem directories
+         for problem_dir in sorted(frontier_cs_dir.iterdir()):
+             if not problem_dir.is_dir() or not problem_dir.name.startswith("problem_"):
+                 continue
+
+             # Extract problem ID
+             problem_id = problem_dir.name.replace("problem_", "")
+
+             # Look for best_program.cpp
+             best_program_path = problem_dir / "best" / "best_program.cpp"
+             if best_program_path.exists():
+                 best_programs[problem_id] = best_program_path
+                 logger.info(f"Found best_program.cpp for problem {problem_id}")
+             else:
+                 logger.warning(f"best_program.cpp not found for problem {problem_id} at {best_program_path}")
+
+         return best_programs
+
+     def evaluate_program(self, problem_id: str, program_path: Path) -> Dict:
+         """
+         Evaluate a single best_program.cpp file.
+
+         Args:
+             problem_id: The Frontier-CS problem ID
+             program_path: Path to the best_program.cpp file
+
+         Returns:
+             Dictionary with evaluation results
+         """
+         logger.info(f"Evaluating problem {problem_id}: {program_path}")
+
+         try:
+             # Read the solution code
+             if not program_path.exists():
+                 error_msg = f"Solution file not found: {program_path}"
+                 logger.error(error_msg)
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "error",
+                     "message": error_msg,
+                 }
+
+             # Read the code
+             code = program_path.read_text().replace(
+                 "// EVOLVE-BLOCK-START", ""
+             ).replace(
+                 "// EVOLVE-BLOCK-END", ""
+             ).strip()
+
+             logger.info(f"Code extracted from {program_path}, length: {len(code)} characters")
+
+             # Evaluate the solution (use thread-local evaluator)
+             evaluator = self._get_evaluator()
+             result = evaluator.evaluate(
+                 track="algorithmic",
+                 problem_id=problem_id,
+                 code=code,
+                 backend="docker",
+             )
+
+             logger.info(f"Evaluation completed for problem {problem_id} with status: {result.status}")
+
+             # Log the result object and its properties
+             logger.info(f"Judger output for problem {problem_id}:")
+             logger.info(f"  Status: {result.status}")
+             logger.info(f"  Message: {result.message}")
+             if hasattr(result, 'score'):
+                 logger.info(f"  Score: {result.score}")
+             if hasattr(result, 'duration_seconds'):
+                 logger.info(f"  Duration: {result.duration_seconds}s")
+             if hasattr(result, 'metadata'):
+                 logger.info(f"  Metadata: {result.metadata}")
+             logger.info(f"  Full result object: {result}")
+
+             # Process result
+             if result.status == EvaluationStatus.SUCCESS:
+                 score = result.score
+                 logger.info(f"Problem {problem_id}: Score = {score}")
+
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": float(score),
+                     "runs_successfully": 1.0,
+                     "status": "success",
+                     "message": result.message or "Evaluation successful",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                     "metadata": result.metadata if hasattr(result, 'metadata') else None,
+                 }
+             elif result.status == EvaluationStatus.TIMEOUT:
+                 logger.warning(f"Problem {problem_id}: Evaluation timed out")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "timeout",
+                     "message": f"Evaluation timed out: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+             elif result.status == EvaluationStatus.COMPILATION_ERROR:
+                 logger.warning(f"Problem {problem_id}: Compilation error")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "compilation_error",
+                     "message": f"Compilation error: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+             else:
+                 logger.error(f"Problem {problem_id}: Evaluation failed with status {result.status}")
+                 return {
+                     "problem_id": problem_id,
+                     "program_path": str(program_path),
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": str(result.status),
+                     "message": f"Evaluation failed: {result.message}",
+                     "duration_seconds": result.duration_seconds,
+                     "judger_output": str(result),
+                 }
+
+         except Exception as e:
+             logger.error(f"Exception while evaluating problem {problem_id}: {str(e)}")
+             logger.error(f"Exception type: {type(e).__name__}")
+             import traceback
+             logger.error(traceback.format_exc())
+
+             return {
+                 "problem_id": problem_id,
+                 "program_path": str(program_path),
+                 "combined_score": 0.0,
+                 "runs_successfully": 0.0,
+                 "status": "exception",
+                 "message": str(e),
+             }
+
+     def run_all_evaluations(self) -> List[Dict]:
+         """
+         Run evaluations for all best_program.cpp files sequentially (one at a time).
+
+         Returns:
+             List of evaluation results
+         """
+         logger.info(f"Starting evaluation of all best programs in {self.outputs_dir}")
+
+         best_programs = self.find_best_programs()
+         logger.info(f"Found {len(best_programs)} best_program.cpp files")
+
+         if not best_programs:
+             logger.warning("No best_program.cpp files found!")
+             return []
+
+         # Sort problems by ID for consistent ordering
+         sorted_problems = sorted(best_programs.items(), key=lambda x: int(x[0]))
+
+         # Evaluate each program sequentially (no parallelization)
+         results = []
+         total = len(sorted_problems)
+         for idx, (problem_id, program_path) in enumerate(sorted_problems, 1):
+             logger.info(f"[SEQ] Evaluating problem {problem_id} ({idx}/{total})")
+             try:
+                 result = self.evaluate_program(problem_id, program_path)
+
+                 # CRITICAL: Ensure problem_id matches
+                 if result.get("problem_id") != problem_id:
+                     logger.error(f"[CRITICAL] Problem ID MISMATCH! Expected {problem_id}, got {result.get('problem_id')}")
+                     result["problem_id"] = problem_id  # Force correct problem_id
+
+                 results.append(result)
+                 self.results.append(result)
+
+                 logger.info(f"[SAVE] Saving problem {problem_id} result to file")
+                 # Save result immediately after evaluation
+                 self.save_problem_result(result)
+
+             except Exception as e:
+                 logger.error(f"Exception evaluating problem {problem_id}: {str(e)}")
+                 import traceback
+                 logger.error(traceback.format_exc())
+
+                 error_result = {
+                     "problem_id": problem_id,
+                     "combined_score": 0.0,
+                     "runs_successfully": 0.0,
+                     "status": "exception",
+                     "message": str(e),
+                 }
+                 results.append(error_result)
+                 self.results.append(error_result)
+                 self.save_problem_result(error_result)
+
+         return results
+
+     def save_results(self, output_file: str = "evaluation_results.json"):
+         """
+         Save evaluation results to a JSON file.
+
+         Args:
+             output_file: Path to save the results
+         """
+         output_path = Path(output_file)
+         with open(output_path, 'w') as f:
+             json.dump(self.results, f, indent=2)
+         logger.info(f"Results saved to {output_path}")
+
+     def save_problem_result(self, result: Dict):
+         """
+         Save individual problem result to a separate file.
+
+         Args:
+             result: The evaluation result for a single problem
+         """
+         problem_id = result.get("problem_id", "unknown")
+         result_file = self.results_dir / f"problem_{problem_id}.json"
+
+         with open(result_file, 'w') as f:
+             json.dump(result, f, indent=2)
+         logger.info(f"Problem {problem_id} result saved to {result_file}")
+
+     def print_summary(self):
+         """Print a summary of the evaluation results."""
+         if not self.results:
+             logger.info("No results to summarize")
+             return
+
+         logger.info("\n" + "="*80)
+         logger.info("EVALUATION SUMMARY")
+         logger.info("="*80)
+
+         successful = [r for r in self.results if r.get("status") == "success"]
+         timeout = [r for r in self.results if r.get("status") == "timeout"]
+         compilation_error = [r for r in self.results if r.get("status") == "compilation_error"]
+         other_error = [r for r in self.results if r.get("status") not in ["success", "timeout", "compilation_error"]]
+
+         logger.info(f"Total problems evaluated: {len(self.results)}")
+         logger.info(f"Successful: {len(successful)}")
+         logger.info(f"Timeouts: {len(timeout)}")
+         logger.info(f"Compilation errors: {len(compilation_error)}")
+         logger.info(f"Other errors: {len(other_error)}")
+
+         if successful:
+             scores = [r["combined_score"] for r in successful]
+             logger.info("\nSuccessful evaluation scores:")
+             logger.info(f"  Average score: {sum(scores) / len(scores):.2f}")
+             logger.info(f"  Min score: {min(scores):.2f}")
+             logger.info(f"  Max score: {max(scores):.2f}")
+
+             logger.info("\nTop 5 problems by score:")
+             top_5 = sorted(successful, key=lambda r: r["combined_score"], reverse=True)[:5]
+             for i, result in enumerate(top_5, 1):
+                 logger.info(f"  {i}. Problem {result['problem_id']}: {result['combined_score']:.2f}")
+
+         logger.info("="*80 + "\n")
+
+
+ def main():
+     """Main entry point."""
+     import argparse
+
+     parser = argparse.ArgumentParser(
+         description="Evaluate all best_program.cpp files in the outputs directory"
+     )
+
+     # Default outputs directory is two levels up from this script
+     default_outputs_dir = Path(__file__).resolve().parent.parent.parent / "outputs"
+
+     parser.add_argument(
+         "--outputs-dir",
+         type=str,
+         default=str(default_outputs_dir),
+         help="Path to the outputs directory (default: ../../outputs from script location)"
+     )
+     parser.add_argument(
+         "--judge-url",
+         type=str,
+         default="http://localhost:8081",
+         help="URL of the judge server (default: http://localhost:8081)"
+     )
+     parser.add_argument(
+         "--output-file",
+         type=str,
+         default="evaluation_results.json",
+         help="Path to save the evaluation results (default: evaluation_results.json)"
+     )
+     parser.add_argument(
+         "--workers",
+         type=int,
+         default=8,
+         help="Number of parallel workers for evaluation (default: 8)"
+     )
+
+     args = parser.parse_args()
+
+     # Run evaluations
+     evaluator = BestProgramEvaluator(
+         outputs_dir=args.outputs_dir,
+         judge_url=args.judge_url,
+         num_workers=args.workers
+     )
+
+     results = evaluator.run_all_evaluations()
+     evaluator.save_results(args.output_file)
+     evaluator.print_summary()
+
+     logger.info(f"Evaluation complete. Results saved to {args.output_file}")
+
+
+ if __name__ == "__main__":
+     main()
benchmarks/image_gen/README.md ADDED
@@ -0,0 +1,40 @@
+ # Image Generation Benchmark
+
+ This benchmark evaluates whether SkyDiscover can optimize images, not just code or text. Each "solution" in the population is an image, evolved by generating and scoring variants from a candidate pool stored in the database. The evolutionary loop is the same as for code — parent selection, mutation via the LLM, crossover via context images drawn from other islands — but instead of evolving Python programs, SkyDiscover evolves text prompts fed to GPT-5's native image generation. The VLM receives the actual parent and context images alongside text guidance, reasons about what to improve, and generates a new image. Setting `language: "image"` in the config is the only change needed.
+
+ ## Benchmark: Sky Festival
+
+ **Directory:** `sky_festival/`
+
+ The system must generate a floating sky-festival image where many details must match exact structural constraints: 9 clouds with specific shapes (rabbit, teacup, musical note, crescent moon, whale, etc.), 5 hot-air balloons with exact colors, passengers, and a banner reading "HAPPY 100TH SKY FESTIVAL", a floating island with 4 trees in a specific left-to-right order, and a party table with precisely counted items (6 cupcakes, 8 golden plates, 5 gift boxes in a pyramid). The scene also includes 6 characters with specific attributes (e.g., a robot with 3 colored buttons on its chest, a grandmother giving a thumbs-up with her left hand), flying creatures, and a correctly ordered 7-band rainbow. The full specification is about 2000 words and lives in `config.yaml`'s `prompt.system_message`.
+
+ **Evaluator.** Each generated image is graded by a GPT-5 vision judge using a strict rubric. The judge receives the image and a detailed scoring sheet, then returns per-category scores across 7 dimensions — cloud shapes (15 pts), balloons (20 pts), floating island (10 pts), table items (20 pts), characters (15 pts), decorations/creatures (10 pts), and rainbow/lighting (10 pts) — for a total of 100 points. The judge is instructed to be extremely harsh: points are awarded only when requirements are clearly and unambiguously met in the image.
+
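For reference, the per-category point budget described above sums to 100; a small sketch of the decomposition (category names abbreviated):

```python
rubric = {
    "cloud_shapes": 15, "balloons": 20, "floating_island": 10,
    "table_items": 20, "characters": 15, "decorations_creatures": 10,
    "rainbow_lighting": 10,
}
assert sum(rubric.values()) == 100  # total points awarded by the GPT-5 judge
```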
+ ## Setup
+
+ 1. **Set your API key:**
+
+    ```bash
+    export OPENAI_API_KEY=...
+    ```
+
+    Both the image generator (GPT-5) and the evaluator judge (GPT-5) use the OpenAI API.
+
+ ## Run
+
+ ```bash
+ cd benchmarks/image_gen/sky_festival
+
+ # AdaEvolve
+ uv run skydiscover-run evaluator.py -c config.yaml -s adaevolve -o sky_festival_output
+
+ # EvoX
+ uv run skydiscover-run evaluator.py -c config.yaml -s evox -o sky_festival_output
+ ```
+
+ ## Files
+
+ | File | Description |
+ |------|-------------|
+ | `sky_festival/evaluator.py` | GPT-5 vision judge that scores images against the 100-point rubric |
+ | `sky_festival/config.yaml` | Config — scene specification in `prompt.system_message` |
benchmarks/image_gen/sky_festival/config.yaml ADDED
@@ -0,0 +1,103 @@
+ # Sky Festival Benchmark
+ #
+ # Usage:
+ #   cd benchmarks/image_gen/sky_festival
+ #   skydiscover-run evaluator.py -c config.yaml -s adaevolve -o sky_festival_output
+
+ language: "image"
+ diff_based_generation: false
+ max_iterations: 100
+ checkpoint_interval: 1
+
+ llm:
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   temperature: 0.9
+   max_tokens: 16384
+   timeout: 300
+
+ evaluator:
+   timeout: 300
+
+ prompt:
+   system_message: |
+     You are an expert visual artist and image generation AI specializing in
+     complex compositional scenes with precise object counting, spatial
+     arrangement, and rich detail.
+
+     You can see the current images from the database along with their scores
+     across 7 categories: cloud shapes, hot air balloons, floating island,
+     table items, characters, decorations/creatures, and rainbow/lighting.
+
+     Your goal is to generate a NEW, improved image that scores higher on
+     the rubric. Pay special attention to:
+     - EXACT counts: 9 shaped clouds, 5 balloons, 4 trees, 6 cupcakes, 8 plates, 5 gifts, 6 characters, 11 bunting flags, 7 lanterns, 7 rainbow bands
+     - Correct passengers in each balloon (2 children, 1 woman, 3 cats, 1 violinist, empty)
+     - Legible text: "HAPPY 100TH SKY FESTIVAL" on banner, "100 YEARS" on cake
+     - Specific character details: robot buttons, grandmother's LEFT hand thumbs-up, dog's striped hat
+     - Correct spatial ordering: trees left-to-right, gift pyramid, cupcake grid
+     - Warm golden lighting from upper left, consistent shadows
+
+     Also provide brief text reasoning about your approach and what you changed.
+
+     # Target Image Description
+     A joyful, sunlit floating sky festival on a perfect summer day, viewed from a slightly elevated angle.
+
+     THE SKY AND BACKGROUND:
+     The sky is a brilliant gradient from warm gold at the horizon to deep cerulean blue at the top. There are exactly 9 fluffy white clouds scattered across the sky. Each cloud has a distinct shape: cloud 1 looks like a rabbit, cloud 2 looks like a teacup, cloud 3 looks like a musical note, cloud 4 looks like a crescent moon, cloud 5 looks like a whale, cloud 6 looks like a bicycle, cloud 7 looks like a crown, cloud 8 looks like a butterfly, cloud 9 looks like the number 7. The clouds are arranged in a gentle arc from left to right across the upper third of the image.
+
+     THE HOT AIR BALLOONS:
+     There are exactly 5 hot air balloons floating at different heights. Each balloon has a unique color and pattern:
+     - Balloon 1 (leftmost, highest): Red with white horizontal stripes. Its basket carries exactly 2 waving children.
+     - Balloon 2 (second from left, medium height): Sunshine yellow with orange polka dots. Its basket carries exactly 1 old woman holding a telescope.
+     - Balloon 3 (center, lowest): Rainbow gradient (red-orange-yellow-green-blue-purple from top to bottom). Its basket carries exactly 3 cats — one orange tabby, one black, one white — all wearing tiny party hats.
+     - Balloon 4 (second from right, medium height): Deep purple with gold stars printed on it. Its basket carries exactly 1 man playing a violin.
+     - Balloon 5 (rightmost, highest): Emerald green with a large white peace sign on the front. Its basket is empty but has a banner hanging from it that reads exactly: "HAPPY 100TH SKY FESTIVAL"
+
+     THE FLOATING ISLAND:
+     Below the balloons, there is a lush green floating island suspended in mid-air. The island is roughly circular and has grass, wildflowers, and 4 trees on it. The trees are different species: one oak with a thick trunk, one cherry blossom in full pink bloom, one palm tree leaning slightly right, and one pine tree (tallest of the four). The trees are spaced evenly along the island from left to right in that exact order: oak, cherry blossom, palm, pine.
+
+     THE PARTY TABLE:
+     On the center of the floating island sits a long rectangular wooden table covered with a checkered red-and-white tablecloth. On the table, from left to right:
+     - A 3-tier birthday cake with white frosting. The bottom tier has blue frosting roses, the middle tier has pink frosting roses, the top tier has a single golden candle that is lit with a bright flame. Written on the middle tier in purple icing: "100 YEARS"
+     - Exactly 6 cupcakes arranged in 2 rows of 3. Each cupcake has a different colored frosting: red, orange, yellow, green, blue, purple (in that order, left to right, top row first).
+     - A glass pitcher of lemonade, three-quarters full, with exactly 3 lemon slices floating in it and 2 ice cubes visible.
+     - A stack of exactly 8 golden plates.
+     - Exactly 5 colorful gift boxes stacked in a pyramid: 3 on the bottom row (red, blue, green from left to right), 2 on top (yellow, purple from left to right). Each gift box has a white ribbon bow on top.
+
+     THE CHARACTERS AROUND THE TABLE:
+     Seated around the table are exactly 6 characters, 3 on each long side facing each other:
+     - Left side (facing right), from left to right: A smiling girl with pigtails wearing a blue dress, a jolly round penguin wearing a red bowtie, and a tall giraffe whose long neck extends above the frame but whose smiling face peeks down from above.
+     - Right side (facing left), from left to right: A friendly robot with a square head and glowing green eyes, a grandmother in a floral apron giving a thumbs-up with her LEFT hand, and a golden retriever dog sitting upright on a chair wearing a cone-shaped party hat with blue and white stripes.
+
+     THE BUNTING AND DECORATIONS:
+     Strung between the cherry blossom tree and the pine tree is a triangular bunting banner with exactly 11 small triangular flags. The flags alternate in color: red, yellow, blue, red, yellow, blue, red, yellow, blue, red, yellow. Below the bunting, there are exactly 7 paper lanterns hanging at different heights. The lanterns are spherical and glow warmly in these colors from left to right: orange, pink, gold, white, lavender, mint green, coral.
+
+     THE ANIMALS IN THE SKY:
+     Flying around the balloons are exactly 4 birds and 2 butterflies. The birds are: 1 blue jay, 1 cardinal (red), 1 canary (yellow), and 1 hummingbird (iridescent green). The 2 butterflies are: one monarch (orange and black) and one morpho (brilliant blue). The blue jay and the cardinal are flying together near Balloon 2. The canary is perched on top of Balloon 4. The hummingbird hovers near the cherry blossom tree. The monarch butterfly is near the bunting. The morpho butterfly is near Balloon 5.
+
+     THE FLOATING MUSICAL NOTES:
+     Drifting upward from the violin player in Balloon 4, there are exactly 5 golden musical notes of different sizes, getting smaller as they rise higher. They follow a gentle curved path upward and to the right.
+
+     THE RAINBOW:
+     Behind everything, a complete semicircular rainbow arcs from the lower left to the lower right of the scene. It has the correct 7 color bands in order from outside to inside: red, orange, yellow, green, blue, indigo, violet.
+
+     LIGHTING AND ATMOSPHERE:
+     The scene is lit by warm, golden afternoon sunlight coming from the upper left. All shadows fall to the lower right. The overall mood is magical, celebratory, and full of wonder. There is a soft, warm glow around the floating island. The light catches the glass lemonade pitcher creating a small sparkle. The golden candle flame on the cake emits a tiny warm glow.
+
+     IMPORTANT DETAILS:
+     - The girl with pigtails has exactly 5 fingers visible on each hand.
91
+ - The robot has exactly 3 buttons on its chest: a red circle, a green square, and a blue triangle, arranged vertically.
92
+ - The grandmother's floral apron has exactly sunflowers on it, not roses or daisies.
93
+ - Every character at the table who has a mouth is smiling.
94
+ - The penguin's red bowtie has white polka dots on it.
95
+
96
+ monitor:
97
+ enabled: true
98
+ port: 8765
99
+ summary_model: "gpt-5"
100
+ summary_interval: 5
101
+
102
+ hil_enabled: true
103
+ hil_mode: "append"
benchmarks/image_gen/sky_festival/evaluator.py ADDED
@@ -0,0 +1,220 @@
+ """
+ Sky Festival evaluator — GPT-5 LLM-as-a-judge.
+
+ Scores VLM-generated images against a 100-point rubric using GPT-5 vision.
+ Returns combined_score normalized to [0, 1].
+
+ The framework passes the image path via a sidecar file:
+     <program_path>.image_path -> absolute path to the generated image
+
+ Requirements:
+     pip install openai
+     Environment: OPENAI_API_KEY (required), JUDGE_MODEL (optional, default gpt-5)
+ """
+
+ import base64
+ import json
+ import logging
+ import os
+ import re
+ from typing import Dict, Union
+
+ logger = logging.getLogger(__name__)
+
+ JUDGE_MODEL = os.environ.get("JUDGE_MODEL", "gpt-5")
+
+ SYSTEM_PROMPT = """\
+ You are an extremely strict image evaluation judge. You score images against a precise rubric.
+ You must output ONLY valid JSON with the exact keys specified. No markdown, no explanation outside JSON.
+ Be harsh — most AI-generated images fail these criteria. Award points only when clearly met.
+ If you cannot verify a requirement (e.g., too small to see), award 0 for that item."""
+
+ RUBRIC_PROMPT = """\
+ Score this image against the following rubric for a "Floating Sky Festival" scene.
+ Be extremely strict. Only award points when requirements are CLEARLY and UNAMBIGUOUSLY met.
+
+ ## Category 1: Cloud Counting and Shapes (15 pts)
+ - Exactly 9 clouds visible in the sky: 5 pts (8 or 10 clouds = 0)
+ - At least 5 of the 9 clouds have recognizable distinct shapes (rabbit, teacup, musical note, crescent moon, whale, bicycle, crown, butterfly, number 7): 10 pts (2 pts per recognizable shape, max 10)
+
+ ## Category 2: Hot Air Balloons — Count, Colors, and Passengers (20 pts)
+ - Exactly 5 hot air balloons visible: 4 pts (4 or 6 = 0)
+ - Each balloon has correct distinct color/pattern (red-striped, yellow-dotted, rainbow, purple-stars, green-peace-sign): 6 pts (deduct 2 per wrong/missing pattern)
+ - Correct passenger count per balloon (2 children, 1 woman, 3 cats, 1 violinist, empty): 6 pts (deduct 2 per wrong count)
+ - Banner on Balloon 5 reads exactly "HAPPY 100TH SKY FESTIVAL": 4 pts (any word wrong = 0)
+
+ ## Category 3: Floating Island and Trees (10 pts)
+ - Floating island visible suspended in air: 3 pts
+ - Exactly 4 different trees on the island: 4 pts (3 or 5 = 0)
+ - Trees in correct order left to right (oak, cherry blossom, palm, pine): 3 pts
+
+ ## Category 4: Party Table Items — Counting and Arrangement (20 pts)
+ - 3-tier cake with candle present: 3 pts
+ - Cake text "100 YEARS" legible on middle tier: 3 pts
+ - Exactly 6 cupcakes in 2 rows of 3 with different colored frostings: 4 pts
+ - Lemonade pitcher with 3 lemon slices and 2 ice cubes: 3 pts
+ - Stack of exactly 8 golden plates: 3 pts
+ - Exactly 5 gift boxes in pyramid (3 bottom, 2 top): 4 pts
+
+ ## Category 5: Characters — Count, Identity, and Details (15 pts)
+ - Exactly 6 characters seated at the table (3 per side): 5 pts
+ - Correct characters identifiable (girl with pigtails, penguin with bowtie, giraffe, robot, grandmother, golden retriever): 5 pts (1 pt per correct character, max 5 — giraffe counts as 1 even if neck extends)
+ - Specific details: robot has 3 colored buttons on chest, grandmother thumbs-up with LEFT hand, dog wears striped party hat, girl has 5 fingers per hand: 5 pts (deduct 1.5 per missing detail)
+
+ ## Category 6: Decorations and Flying Creatures (10 pts)
+ - Bunting banner with approximately 11 flags in alternating red/yellow/blue: 3 pts
+ - Exactly 7 paper lanterns in different colors: 3 pts
+ - Correct flying creatures: 4 birds (blue jay, cardinal, canary, hummingbird) + 2 butterflies (monarch, morpho): 4 pts (1 pt per 2 correct creatures)
+
+ ## Category 7: Rainbow, Lighting, and Overall Composition (10 pts)
+ - Complete semicircular rainbow with 7 color bands in correct order: 4 pts
+ - Consistent warm golden lighting from upper left with shadows falling lower right: 3 pts
+ - Overall magical/celebratory mood, scene is joyful and cohesive: 3 pts
+
+ Respond with ONLY this JSON (no other text):
+ {
+   "cloud_shapes": <0-15>,
+   "balloons": <0-20>,
+   "floating_island": <0-10>,
+   "table_items": <0-20>,
+   "characters": <0-15>,
+   "decorations_creatures": <0-10>,
+   "rainbow_lighting": <0-10>,
+   "reasoning": "<brief 2-3 sentence explanation>"
+ }"""
+
+ # Category maximum scores for validation
+ CATEGORY_MAXES = {
+     "cloud_shapes": 15,
+     "balloons": 20,
+     "floating_island": 10,
+     "table_items": 20,
+     "characters": 15,
+     "decorations_creatures": 10,
+     "rainbow_lighting": 10,
+ }
+
+ _client = None
+
+
+ def _get_client():
+     global _client
+     if _client is None:
+         from openai import OpenAI
+         _client = OpenAI()
+     return _client
+
+
+ def _encode_image(image_path: str) -> str:
+     with open(image_path, "rb") as f:
+         return base64.b64encode(f.read()).decode("utf-8")
+
+
+ def _judge_image(image_path: str) -> Dict[str, Union[float, str]]:
+     """Call GPT-5 to score the image. Retries once on failure."""
+     client = _get_client()
+     b64 = _encode_image(image_path)
+
+     ext = os.path.splitext(image_path)[1].lstrip(".").lower()
+     mime = {"png": "image/png", "jpg": "image/jpeg", "jpeg": "image/jpeg", "webp": "image/webp"}.get(ext, "image/png")
+     data_url = f"data:{mime};base64,{b64}"
+
+     messages = [
+         {"role": "system", "content": SYSTEM_PROMPT},
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image_url", "image_url": {"url": data_url, "detail": "high"}},
+                 {"type": "text", "text": RUBRIC_PROMPT},
+             ],
+         },
+     ]
+
+     last_error = None
+     for attempt in range(2):
+         try:
+             response = client.chat.completions.create(
+                 model=JUDGE_MODEL,
+                 messages=messages,
+                 max_completion_tokens=16384,
+             )
+             content = response.choices[0].message.content or ""
+             raw = content.strip()
+             logger.info(f"Judge raw response (first 300 chars): {raw[:300]}")
+
+             # Extract JSON from markdown code block if present
+             if "```" in raw:
+                 m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
+                 if m:
+                     raw = m.group(1).strip()
+
+             # Find JSON object in response
+             start = raw.find("{")
+             end = raw.rfind("}") + 1
+             if start >= 0 and end > start:
+                 raw = raw[start:end]
+
+             result = json.loads(raw)
+
+             # Validate and clamp scores
+             scores = {}
+             for cat, max_val in CATEGORY_MAXES.items():
+                 val = result.get(cat, 0)
+                 if not isinstance(val, (int, float)):
+                     val = 0
+                 scores[cat] = max(0, min(max_val, float(val)))
+
+             scores["reasoning"] = str(result.get("reasoning", ""))
+             return scores
+
+         except Exception as e:
+             last_error = e
+             logger.warning(f"Judge attempt {attempt + 1} failed: {e}")
+
+     logger.error(f"GPT-5 judge failed after retries: {last_error}")
+     return {cat: 0.0 for cat in CATEGORY_MAXES}
+
+
+ def evaluate(program_path: str) -> Dict[str, Union[float, str]]:
+     """Score a VLM-generated image using GPT-5 as judge.
+
+     Args:
+         program_path: Path to the text file (VLM reasoning).
+             A sidecar file ``<program_path>.image_path`` contains the
+             absolute path to the generated image.
+
+     Returns:
+         Dictionary with combined_score (0-1), per-category scores, and image_path.
+     """
+     # Read image path from sidecar
+     sidecar = program_path + ".image_path"
+     image_path = None
+     if os.path.exists(sidecar):
+         with open(sidecar) as f:
+             image_path = f.read().strip()
+
+     if not image_path or not os.path.exists(image_path):
+         logger.warning("No image found for scoring")
+         return {"combined_score": 0.0, "error": "No image to score"}
+
+     # Score with GPT-5
+     scores = _judge_image(image_path)
+
+     # Compute total out of 100, normalize to 0-1
+     total = sum(v for k, v in scores.items() if k in CATEGORY_MAXES)
+     combined = round(total / 100.0, 4)
+
+     result = {"combined_score": combined, "image_path": image_path}
+
+     # Add per-category scores (normalized to 0-1 for each category)
+     for cat, max_val in CATEGORY_MAXES.items():
+         result[cat] = round(scores.get(cat, 0) / max_val, 4)
+
+     # Also store raw scores
+     result["raw_total"] = round(total, 1)
+
+     reasoning = scores.get("reasoning", "")
+     if reasoning:
+         result["judge_reasoning"] = reasoning
+
+     return result
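
A minimal harness-side sketch of the sidecar protocol described in the module docstring; the file names and reasoning text here are hypothetical, not part of the benchmark:

    # Hypothetical driver: write the sidecar next to the VLM's reasoning file,
    # then call evaluate(). Assumes the evaluator.py above is importable.
    import os, tempfile
    from evaluator import evaluate

    reasoning_path = os.path.join(tempfile.mkdtemp(), "program.txt")
    with open(reasoning_path, "w") as f:
        f.write("Raised cloud count to 9 and fixed the banner text.")
    with open(reasoning_path + ".image_path", "w") as f:
        f.write("/abs/path/to/generated.png")  # sidecar -> generated image

    # Returns {"combined_score": 0.0, "error": "No image to score"} if the PNG is absent.
    print(evaluate(reasoning_path))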
benchmarks/math/circle_packing_rect/evaluator/evaluator.py ADDED
@@ -0,0 +1,119 @@
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # This file implements the evaluator for the circle packing problem on a rectangle
+ # of perimeter 4.
+ #
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # Some of the code in this file is adapted from:
+ #
+ # google-deepmind/alphaevolve_results:
+ #   Licensed under the Apache License v2.0.
+ #
+ # ===--------------------------------------------------------------------------------------===#
+
+ import time
+ import numpy as np
+ import sys
+ import os
+ from importlib import __import__
+
+ BENCHMARK = 2.3658321334167627
+ NUM_CIRCLES = 21
+ TOL = 1e-6
+
+
+ def minimum_circumscribing_rectangle(circles: np.ndarray):
+     """Returns the width and height of the minimum circumscribing rectangle.
+
+     Args:
+         circles: A numpy array of shape (num_circles, 3), where each row is of the
+             form (x, y, radius), specifying a circle.
+
+     Returns:
+         A tuple (width, height) of the minimum circumscribing rectangle.
+     """
+     min_x = np.min(circles[:, 0] - circles[:, 2])
+     max_x = np.max(circles[:, 0] + circles[:, 2])
+     min_y = np.min(circles[:, 1] - circles[:, 2])
+     max_y = np.max(circles[:, 1] + circles[:, 2])
+     return max_x - min_x, max_y - min_y
+
+
+ def validate_packing_radii(radii: np.ndarray) -> None:
+     n = len(radii)
+     for i in range(n):
+         if radii[i] < 0:
+             raise ValueError(f"Circle {i} has negative radius {radii[i]}")
+         elif np.isnan(radii[i]):
+             raise ValueError(f"Circle {i} has nan radius")
+
+
+ def validate_packing_overlap_wtol(circles: np.ndarray, tol: float = 1e-6) -> None:
+     n = len(circles)
+     for i in range(n):
+         for j in range(i + 1, n):
+             dist = np.sqrt(np.sum((circles[i, :2] - circles[j, :2]) ** 2))
+             if dist < circles[i, 2] + circles[j, 2] - tol:
+                 raise ValueError(
+                     f"Circles {i} and {j} overlap: dist={dist}, r1+r2={circles[i,2]+circles[j,2]}"
+                 )
+
+
+ def validate_packing_inside_rect_wtol(circles: np.ndarray, tol: float = 1e-6) -> None:
+     width, height = minimum_circumscribing_rectangle(circles)
+     if width + height > (2 + tol):
+         raise ValueError("Circles are not contained inside a rectangle of perimeter 4.")
+
+
+ def evaluate(program_path: str):
+     try:
+         abs_program_path = os.path.abspath(program_path)
+         program_dir = os.path.dirname(abs_program_path)
+         module_name = os.path.splitext(os.path.basename(program_path))[0]
+
+         circles = None
+         eval_time = 0
+         try:
+             sys.path.insert(0, program_dir)
+             program = __import__(module_name)
+
+             start_time = time.time()
+             circles = program.circle_packing21()
+             end_time = time.time()
+             eval_time = end_time - start_time
+         except Exception as err:
+             raise err
+         finally:
+             if program_dir in sys.path:
+                 sys.path.remove(program_dir)
+
+         if not isinstance(circles, np.ndarray):
+             circles = np.array(circles)
+
+         if circles.shape != (NUM_CIRCLES, 3):
+             raise ValueError(
+                 f"Invalid shapes: circles = {circles.shape}, expected {(NUM_CIRCLES,3)}"
+             )
+
+         validate_packing_radii(circles[:, -1])
+         validate_packing_overlap_wtol(circles, TOL)
+         validate_packing_inside_rect_wtol(circles, TOL)
+
+         radii_sum = np.sum(circles[:, -1])
+
+         return {
+             "radii_sum": float(radii_sum),
+             "combined_score": float(radii_sum / BENCHMARK),
+             "eval_time": float(eval_time),
+         }
+     except Exception as e:
+         return {"combined_score": 0.0, "error": str(e)}
+
+
+ if __name__ == "__main__":
+     # Backwards-compat: bridges old evaluate() -> dict to the container JSON
+     # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py.
+     from wrapper import run
+
+     run(evaluate)
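
For orientation, a sketch of a valid (deliberately weak) candidate that passes all three validators; the grid construction is illustrative, not part of the benchmark. A 5x5 grid of radius-0.1 circles fits in the unit square (perimeter 4), and keeping the first 21 gives radii_sum = 2.1, i.e. combined_score ≈ 2.1 / 2.3658 ≈ 0.89:

    import numpy as np

    def circle_packing21() -> np.ndarray:
        # Centers at 0.1, 0.3, 0.5, 0.7, 0.9: adjacent circles touch exactly,
        # which the overlap check allows within its 1e-6 tolerance.
        centers = [(0.1 + 0.2 * i, 0.1 + 0.2 * j) for j in range(5) for i in range(5)]
        return np.array([[x, y, 0.1] for x, y in centers[:21]])  # (21, 3) rows of (x, y, r)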
benchmarks/math/erdos_min_overlap/config.yaml ADDED
@@ -0,0 +1,41 @@
+ # Math benchmark: erdos_min_overlap
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
+ language: python
+ diff_based_generation: true
+ max_iterations: 100
+ checkpoint_interval: 10
+ max_solution_length: 60000
+ llm:
+   api_base: https://api.openai.com/v1
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   max_tokens: 32000
+   timeout: 600
+ prompt:
+   system_message: |
+     SETTING:
+     You are an expert in harmonic analysis, numerical optimization, and AI-driven mathematical discovery.
+     Your task is to evolve and optimize a Python script to find a better **upper bound** for the Erdős minimum overlap problem constant C5.
+
+     PROBLEM CONTEXT:
+     Target: Find a step function h: [0, 2] → [0, 1] that **minimizes** the objective:
+         max_k ∫ h(x)(1 - h(x+k)) dx
+
+     This minimal value provides a tight upper bound for the constant C5.
+
+     Current best known upper bound: C5 ≤ 0.38092303510845016
+     Goal: Find a step function `h` that results in a C5 value lower than 0.38092303510845016.
+
+     CONSTRAINTS:
+     1. The function `h` must have values in the range [0, 1].
+     2. The integral of h(x) over [0, 2] must be exactly 1.
+
+     PERFORMANCE METRICS:
+     - c5_bound: The bound found by the program.
+     - combined_score: 0.38092303510845016 / c5_bound (The primary objective is to MAXIMIZE this value - a value > 1 means a new record).
+     - n_points: number of points used in the discretization.
+     - eval_time: evaluation time of the program.
+ evaluator:
+   timeout: 600
+   max_retries: 3
benchmarks/math/erdos_min_overlap/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
+ # that define evaluate(program_path) -> dict. Bridges them to the container
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+ COPY . .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/math/erdos_min_overlap/evaluator/requirements.txt ADDED
@@ -0,0 +1,3 @@
+ numpy
+ jax
+ optax
benchmarks/math/erdos_min_overlap/initial_program.py ADDED
@@ -0,0 +1,96 @@
+ # EVOLVE-BLOCK-START
+ import jax
+ import jax.numpy as jnp
+ import optax
+ import numpy as np
+ from dataclasses import dataclass
+ import tqdm
+
+
+ @dataclass
+ class Hyperparameters:
+     num_intervals: int = 200
+     learning_rate: float = 0.005
+     num_steps: int = 20000
+     penalty_strength: float = 1000000.0
+
+
+ class ErdosOptimizer:
+     """
+     Finds a step function h that minimizes the maximum overlap integral.
+     """
+
+     def __init__(self, hypers: Hyperparameters):
+         self.hypers = hypers
+         self.domain_width = 2.0
+         self.dx = self.domain_width / self.hypers.num_intervals
+
+     def _objective_fn(self, latent_h_values: jnp.ndarray) -> jnp.ndarray:
+         """
+         The loss function includes the objective and a penalty for the constraint.
+         """
+         # Enforce h(x) in [0, 1] via sigmoid (hard constraint)
+         h = jax.nn.sigmoid(latent_h_values)
+
+         # Calculate the primary objective (max correlation)
+         j = 1.0 - h
+         N = self.hypers.num_intervals
+         h_padded = jnp.pad(h, (0, N))
+         j_padded = jnp.pad(j, (0, N))
+         corr_fft = jnp.fft.fft(h_padded) * jnp.conj(jnp.fft.fft(j_padded))
+         correlation = jnp.fft.ifft(corr_fft).real
+         scaled_correlation = correlation * self.dx
+         objective_loss = jnp.max(scaled_correlation)
+
+         # Calculate the penalty for the integral constraint
+         integral_h = jnp.sum(h) * self.dx
+         constraint_loss = (integral_h - 1.0) ** 2
+
+         # Combine the objective with the penalty
+         total_loss = objective_loss + self.hypers.penalty_strength * constraint_loss
+         return total_loss
+
+     def run_optimization(self):
+         optimizer = optax.adam(self.hypers.learning_rate)
+
+         key = jax.random.PRNGKey(42)
+         latent_h_values = jax.random.normal(key, (self.hypers.num_intervals,))
+
+         opt_state = optimizer.init(latent_h_values)
+
+         @jax.jit
+         def train_step(latent_h_values, opt_state):
+             loss, grads = jax.value_and_grad(self._objective_fn)(latent_h_values)
+             updates, opt_state = optimizer.update(grads, opt_state)
+             latent_h_values = optax.apply_updates(latent_h_values, updates)
+             return latent_h_values, opt_state, loss
+
+         print(f"Optimizing a step function with {self.hypers.num_intervals} intervals...")
+         for step in tqdm.tqdm(range(self.hypers.num_steps), desc="Optimizing"):
+             latent_h_values, opt_state, loss = train_step(latent_h_values, opt_state)
+
+         # Final h is just the sigmoid of the latent values
+         final_h = jax.nn.sigmoid(latent_h_values)
+
+         # Re-calculate final objective loss without the penalty for the report
+         j = 1.0 - final_h
+         N = self.hypers.num_intervals
+         h_padded = jnp.pad(final_h, (0, N))
+         j_padded = jnp.pad(j, (0, N))
+         corr_fft = jnp.fft.fft(h_padded) * jnp.conj(jnp.fft.fft(j_padded))
+         correlation = jnp.fft.ifft(corr_fft).real
+         c5_bound = jnp.max(correlation * self.dx)
+
+         print(f"Optimization complete. Final C5 upper bound: {c5_bound:.8f}")
+         return np.array(final_h), float(c5_bound)
+
+
+ def run():
+     hypers = Hyperparameters()
+     optimizer = ErdosOptimizer(hypers)
+     final_h_values, c5_bound = optimizer.run_optimization()
+
+     return final_h_values, c5_bound, hypers.num_intervals
+
+
+ # EVOLVE-BLOCK-END
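
A sanity-check sketch (mine, not part of the program) for the zero-padded FFT trick above: the correlation it returns should agree with a direct O(N^2) evaluation of the discretized overlap sum at every shift s:

    import numpy as np

    N, dx = 64, 2.0 / 64
    h = np.random.default_rng(0).uniform(0.0, 1.0, N)
    h_pad, j_pad = np.pad(h, (0, N)), np.pad(1.0 - h, (0, N))
    corr_fft = np.fft.ifft(np.fft.fft(h_pad) * np.conj(np.fft.fft(j_pad))).real * dx

    # Direct sum: corr[s] = dx * sum_m h_pad[m] * j_pad[(m - s) mod 2N]
    M = 2 * N
    corr_direct = np.array(
        [dx * sum(h_pad[m] * j_pad[(m - s) % M] for m in range(M)) for s in range(M)]
    )
    assert np.allclose(corr_fft, corr_direct)
    print("max overlap:", corr_fft.max())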
benchmarks/math/heilbronn_convex/13/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/math/heilbronn_convex/13/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
+ """Backwards-compat wrapper for old Python-based evaluators.
+
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
+ bridges that interface to the container JSON protocol expected by
+ ContainerizedEvaluator.
+
+ Usage — add this to the bottom of your evaluator.py::
+
+     if __name__ == "__main__":
+         from wrapper import run
+         run(evaluate)
+ """
+
+ import json
+ import sys
+ import traceback
+
+
+ def run(evaluate_fn):
+     """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
+
+     * Reads ``sys.argv[1]`` as the program path.
+     * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+       don't contaminate the JSON output.
+     * Separates numeric metrics from non-numeric artifacts.
+     * Guarantees ``combined_score`` is always present in metrics.
+     """
+     if len(sys.argv) < 2:
+         print("Usage: evaluator.py <program_path>", file=sys.stderr)
+         sys.exit(1)
+
+     program_path = sys.argv[1]
+
+     # Redirect stdout → stderr during evaluation so debug prints from
+     # the evaluator don't contaminate the JSON output on stdout.
+     real_stdout = sys.stdout
+     sys.stdout = sys.stderr
+     try:
+         result = evaluate_fn(program_path)
+     except Exception as e:
+         sys.stdout = real_stdout
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": str(e),
+                         "traceback": traceback.format_exc(),
+                     },
+                 }
+             )
+         )
+         return
+     sys.stdout = real_stdout
+
+     if not isinstance(result, dict):
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                     },
+                 }
+             )
+         )
+         return
+
+     # Separate numeric metrics from non-numeric artifacts.
+     metrics = {}
+     artifacts = {}
+     for k, v in result.items():
+         if isinstance(v, bool):
+             metrics[k] = float(v)
+         elif isinstance(v, (int, float)):
+             metrics[k] = float(v)
+         elif isinstance(v, str):
+             artifacts[k] = v
+         elif isinstance(v, (list, dict)):
+             artifacts[k] = json.dumps(v)
+
+     if "combined_score" not in metrics:
+         metrics["combined_score"] = 0.0
+
+     status = "error" if "error" in artifacts else "success"
+     output = {
+         "status": status,
+         "combined_score": metrics["combined_score"],
+         "metrics": metrics,
+     }
+     if artifacts:
+         output["artifacts"] = artifacts
+
+     print(json.dumps(output))
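
To make the contract concrete, a small self-contained demo of the bridge; the program path is a dummy and the metric values are illustrative:

    import sys
    from wrapper import run

    def fake_evaluate(program_path):
        return {"combined_score": 0.9, "eval_time": 1.2, "notes": "converged"}

    sys.argv = ["evaluator.py", "/tmp/program.py"]
    run(fake_evaluate)
    # stdout (one JSON line): {"status": "success", "combined_score": 0.9,
    #   "metrics": {"combined_score": 0.9, "eval_time": 1.2},
    #   "artifacts": {"notes": "converged"}}

Note that values of unsupported types (e.g. None) are silently dropped from the output.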
benchmarks/math/hexagon_packing/11/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/math/matmul/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
+ # that define evaluate(program_path) -> dict. Bridges them to the container
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+ COPY . .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/math/matmul/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/math/matmul/evaluator/evaluator.py ADDED
@@ -0,0 +1,115 @@
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # This file implements the evaluator for the matrix multiplication problem with tensor size
+ # of <2,4,5>
+ #
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # Some of the code in this file is adapted from:
+ #
+ # google-deepmind/alphaevolve_results:
+ #   Licensed under the Apache License v2.0.
+ #
+ # ===--------------------------------------------------------------------------------------===#
+
+ import sys
+ import os
+ from importlib import __import__
+ import time
+ import numpy as np
+
+ BENCHMARK = 32
+
+
+ def verify_tensor_decomposition(
+     decomposition: tuple[np.ndarray, np.ndarray, np.ndarray], n: int, m: int, p: int, rank: int
+ ):
+     """Verifies the correctness of the tensor decomposition."""
+
+     # Add robustness for cases where the optimizer might fail
+     if not all(isinstance(arr, np.ndarray) for arr in decomposition) or not decomposition:
+         raise ValueError("Decomposition must be a tuple of NumPy arrays.")
+     if any(arr.size == 0 for arr in decomposition):
+         print("Warning: One or more decomposition arrays are empty. Verification skipped.")
+         return
+
+     # Check that each factor matrix has the correct shape.
+     factor_matrix_1, factor_matrix_2, factor_matrix_3 = decomposition
+     if factor_matrix_1.shape != (n * m, rank):
+         raise ValueError(
+             f"Expected shape of factor matrix 1 is {(n * m, rank)}. Actual shape is {factor_matrix_1.shape}."
+         )
+     if factor_matrix_2.shape != (m * p, rank):
+         raise ValueError(
+             f"Expected shape of factor matrix 2 is {(m * p, rank)}. Actual shape is {factor_matrix_2.shape}."
+         )
+     if factor_matrix_3.shape != (n * p, rank):
+         raise ValueError(
+             f"Expected shape of factor matrix 3 is {(n * p, rank)}. Actual shape is {factor_matrix_3.shape}."
+         )
+
+     # Form the matrix multiplication tensor <n, m, p>.
+     matmul_tensor = np.zeros((n * m, m * p, n * p), dtype=np.float32)
+     for i in range(n):
+         for j in range(m):
+             for k in range(p):
+                 # Use the standard k*n+i indexing for the third dimension
+                 matmul_tensor[i * m + j, j * p + k, k * n + i] = 1
+
+     # Check that the tensor is correctly constructed.
+     constructed_tensor = np.einsum("ir,jr,kr -> ijk", *decomposition)
+
+     # Exact check
+     if not np.array_equal(constructed_tensor, matmul_tensor):
+         # If the exact check fails, report the floating-point difference for diagnostics.
+         diff = np.max(np.abs(constructed_tensor - matmul_tensor))
+         raise ValueError(
+             f"Tensor constructed by decomposition does not exactly match the target tensor. Maximum difference is {diff:.6e}."
+         )
+
+
+ def evaluate(program_path: str):
+     try:
+         abs_program_path = os.path.abspath(program_path)
+         program_dir = os.path.dirname(abs_program_path)
+         module_name = os.path.splitext(os.path.basename(program_path))[0]
+
+         try:
+             sys.path.insert(0, program_dir)
+             program = __import__(module_name)
+             start_time = time.time()
+             decomposition, n, m, p, loss, rank = program.run()
+             end_time = time.time()
+             eval_time = end_time - start_time
+         except Exception as err:
+             raise err
+         finally:
+             if program_dir in sys.path:
+                 sys.path.remove(program_dir)
+
+         verify_tensor_decomposition(decomposition, n, m, p, rank)
+
+         success_threshold = 1e-6
+         if loss > success_threshold:
+             print(
+                 f"\nWarning: Final loss {loss:.2e} is above the success threshold of {success_threshold:.2e}."
+             )
+
+         inverse_rank = BENCHMARK / rank
+
+         return {
+             "combined_score": inverse_rank,
+             "loss": loss,
+             "rank": rank,
+             "eval_time": float(eval_time),
+         }
+     except Exception as e:
+         return {"combined_score": 0.0, "error": str(e)}
+
+
+ if __name__ == "__main__":
+     # Backwards-compat: bridges old evaluate() -> dict to the container JSON
+     # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py.
+     from wrapper import run
+
+     run(evaluate)
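
As a reference point (my sketch, not part of the evaluator): the naive algorithm gives an exact rank n*m*p = 40 decomposition of <2,4,5>, so with BENCHMARK = 32 it scores 32/40 = 0.8, and any verified rank below 40 is a genuine improvement:

    import numpy as np

    n, m, p = 2, 4, 5
    rank = n * m * p
    U, V, W = np.zeros((n * m, rank)), np.zeros((m * p, rank)), np.zeros((n * p, rank))
    for r, (i, j, k) in enumerate(np.ndindex(n, m, p)):
        # One rank-1 term per scalar product a[i,j] * b[j,k] contributing to c[i,k].
        U[i * m + j, r] = V[j * p + k, r] = W[k * n + i, r] = 1.0

    T = np.zeros((n * m, m * p, n * p))
    for i, j, k in np.ndindex(n, m, p):
        T[i * m + j, j * p + k, k * n + i] = 1.0
    assert np.array_equal(np.einsum("ir,jr,kr->ijk", U, V, W), T)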
benchmarks/math/matmul/evaluator/requirements.txt ADDED
@@ -0,0 +1,3 @@
+ numpy
+ jax
+ optax
benchmarks/math/matmul/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
+ """Backwards-compat wrapper for old Python-based evaluators.
+
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
+ bridges that interface to the container JSON protocol expected by
+ ContainerizedEvaluator.
+
+ Usage — add this to the bottom of your evaluator.py::
+
+     if __name__ == "__main__":
+         from wrapper import run
+         run(evaluate)
+ """
+
+ import json
+ import sys
+ import traceback
+
+
+ def run(evaluate_fn):
+     """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
+
+     * Reads ``sys.argv[1]`` as the program path.
+     * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+       don't contaminate the JSON output.
+     * Separates numeric metrics from non-numeric artifacts.
+     * Guarantees ``combined_score`` is always present in metrics.
+     """
+     if len(sys.argv) < 2:
+         print("Usage: evaluator.py <program_path>", file=sys.stderr)
+         sys.exit(1)
+
+     program_path = sys.argv[1]
+
+     # Redirect stdout → stderr during evaluation so debug prints from
+     # the evaluator don't contaminate the JSON output on stdout.
+     real_stdout = sys.stdout
+     sys.stdout = sys.stderr
+     try:
+         result = evaluate_fn(program_path)
+     except Exception as e:
+         sys.stdout = real_stdout
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": str(e),
+                         "traceback": traceback.format_exc(),
+                     },
+                 }
+             )
+         )
+         return
+     sys.stdout = real_stdout
+
+     if not isinstance(result, dict):
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                     },
+                 }
+             )
+         )
+         return
+
+     # Separate numeric metrics from non-numeric artifacts.
+     metrics = {}
+     artifacts = {}
+     for k, v in result.items():
+         if isinstance(v, bool):
+             metrics[k] = float(v)
+         elif isinstance(v, (int, float)):
+             metrics[k] = float(v)
+         elif isinstance(v, str):
+             artifacts[k] = v
+         elif isinstance(v, (list, dict)):
+             artifacts[k] = json.dumps(v)
+
+     if "combined_score" not in metrics:
+         metrics["combined_score"] = 0.0
+
+     status = "error" if "error" in artifacts else "success"
+     output = {
+         "status": status,
+         "combined_score": metrics["combined_score"],
+         "metrics": metrics,
+     }
+     if artifacts:
+         output["artifacts"] = artifacts
+
+     print(json.dumps(output))
benchmarks/math/matmul/initial_program.py ADDED
@@ -0,0 +1,199 @@
+ # Disable progress bar for cleaner output logs
+ import os
+
+ os.environ["TQDM_DISABLE"] = "1"
+
+ # Fixed parameters
+ n, m, p = 2, 4, 5
+
+ # EVOLVE-BLOCK-START
+ import numpy as np
+ import jax
+ import jax.numpy as jnp
+ import optax
+ from dataclasses import dataclass
+ import tqdm
+
+
+ # --- Straight-Through Estimator for Rounding ---
+ @jax.custom_vjp
+ def round_to_half_ste(x):
+     """Forward pass: snaps values to the nearest half-integer."""
+     return jnp.round(x * 2) / 2
+
+
+ def round_ste_fwd(x):
+     """Standard forward pass and identity for backward pass."""
+     return round_to_half_ste(x), None
+
+
+ def round_ste_bwd(res, g):
+     """Backward pass: Identity function, passes gradient straight through."""
+     return (g,)
+
+
+ round_to_half_ste.defvjp(round_ste_fwd, round_ste_bwd)
+ # --- End of STE definition ---
+
+
+ # --- Loss Functions ---
+ def weighted_l2_loss(reconstructed: jnp.ndarray, target: jnp.ndarray) -> jnp.ndarray:
+     error = reconstructed - target
+     weights = jnp.where(target != 0, 100.0, 1.0)
+     return jnp.mean(weights * (error**2))
+
+
+ def l2_loss_real(x: jnp.ndarray, y: jnp.ndarray) -> jnp.ndarray:
+     return jnp.mean((x - y) ** 2)
+
+
+ # --- Hyperparameters ---
+ @dataclass
+ class Hyperparameters:
+     rank: int = 55
+     # Phase 1: Continuous Search
+     num_restarts: int = 10
+     phase1_steps: int = 80000
+     phase1_lr: float = 0.01
+     init_scale: float = 0.1
+     l1_strength: float = 1e-6
+     clamp_range: float = 4.0
+     # Phase 2: Discrete Fine-tuning
+     phase2_steps: int = 20000
+     phase2_lr: float = 1e-4  # A much smaller learning rate for fine-tuning
+
+
+ # --- Optimizer Classes ---
+ class ContinuousOptimizer:
+     """Finds a high-quality approximate continuous solution."""
+
+     def __init__(self, target_tensor: jnp.ndarray, hypers: Hyperparameters):
+         self.target_tensor = target_tensor
+         self.hypers = hypers
+         self.opt = optax.adam(hypers.phase1_lr)
+
+     def _get_constrained_decomposition(self, latent_decomposition: tuple) -> tuple:
+         """Applies a scaled tanh to map latent parameters to the desired range."""
+         return jax.tree_util.tree_map(
+             lambda x: self.hypers.clamp_range * jnp.tanh(x), latent_decomposition
+         )
+
+     def _loss_fn(self, latent_decomposition: tuple) -> jnp.ndarray:
+         constrained = self._get_constrained_decomposition(latent_decomposition)
+         reconstructed = jnp.einsum("ir,jr,kr->ijk", *constrained)
+         recon_loss = weighted_l2_loss(reconstructed, self.target_tensor)
+         l1_penalty = sum(jnp.mean(jnp.abs(arr)) for arr in constrained)
+         return recon_loss + self.hypers.l1_strength * l1_penalty
+
+
+ class DiscreteOptimizer:
+     """Refines a continuous solution into an exact discrete one using an STE."""
+
+     def __init__(self, target_tensor: jnp.ndarray, hypers: Hyperparameters):
+         self.target_tensor = target_tensor
+         self.hypers = hypers
+         self.opt = optax.adam(hypers.phase2_lr)
+
+     def _loss_fn(self, continuous_decomposition: tuple) -> jnp.ndarray:
+         # Snap the continuous parameters to the discrete grid
+         discrete_decomposition = jax.tree_util.tree_map(round_to_half_ste, continuous_decomposition)
+         # Compute the loss using only these exact half-integer values
+         reconstructed = jnp.einsum("ir,jr,kr->ijk", *discrete_decomposition)
+         return l2_loss_real(reconstructed, self.target_tensor)
+
+
+ # --- JIT-compatible Train Step ---
+ def train_step(params, opt_state, optimizer, loss_fn):
+     loss, grads = jax.value_and_grad(loss_fn)(params)
+     updates, opt_state = optimizer.update(grads, opt_state, params)
+     params = optax.apply_updates(params, updates)
+     return params, opt_state, loss
+
+
+ def get_matrix_multiplication_tensor(n, m, p):
+     T = jnp.zeros((n * m, m * p, n * p))
+     for i, j, k in np.ndindex(n, m, p):
+         T = T.at[i * m + j, j * p + k, k * n + i].set(1)
+     return T
+
+
+ def run():
+     hypers = Hyperparameters()
+     target_tensor = get_matrix_multiplication_tensor(n, m, p)
+     main_key = jax.random.PRNGKey(42)
+
+     # --- PHASE 1: CONTINUOUS EXPLORATION ---
+     print(f"\n{'='*20} PHASE 1: Continuous Exploration {'='*20}")
+     best_loss_phase1 = float("inf")
+     best_latent_decomp = None
+
+     continuous_optimizer = ContinuousOptimizer(target_tensor, hypers)
+
+     # JIT the train_step for the continuous phase
+     jit_train_step_continuous = jax.jit(train_step, static_argnums=(2, 3))
+
+     for i in range(hypers.num_restarts):
+         print(f"\n--- Restart {i+1}/{hypers.num_restarts} ---")
+         main_key, restart_key = jax.random.split(main_key)
+         init_fn = jax.nn.initializers.normal(stddev=hypers.init_scale)
+         latent_decomp = (
+             init_fn(restart_key, (n * m, hypers.rank)),
+             init_fn(restart_key, (m * p, hypers.rank)),
+             init_fn(restart_key, (n * p, hypers.rank)),
+         )
+         opt_state = continuous_optimizer.opt.init(latent_decomp)
+
+         for _ in tqdm.tqdm(range(hypers.phase1_steps), desc="Continuous Search"):
+             latent_decomp, opt_state, loss = jit_train_step_continuous(
+                 latent_decomp,
+                 opt_state,
+                 continuous_optimizer.opt,
+                 continuous_optimizer._loss_fn,
+             )
+
+         final_loss = l2_loss_real(
+             target_tensor,
+             jnp.einsum(
+                 "ir,jr,kr->ijk",
+                 *continuous_optimizer._get_constrained_decomposition(latent_decomp),
+             ),
+         )
+         print(f"End of Trial | Final continuous loss: {final_loss:.8f}")
+
+         if final_loss < best_loss_phase1:
+             best_loss_phase1 = final_loss
+             best_latent_decomp = latent_decomp
+
+     # --- PHASE 2: DISCRETE FINE-TUNING ---
+     print(f"\n{'='*20} PHASE 2: Discrete Fine-tuning (STE) {'='*20}")
+     print(f"Starting with best continuous solution (loss: {best_loss_phase1:.8f})")
+
+     continuous_params = continuous_optimizer._get_constrained_decomposition(best_latent_decomp)
+
+     discrete_optimizer = DiscreteOptimizer(target_tensor, hypers)
+     opt_state = discrete_optimizer.opt.init(continuous_params)
+
+     # JIT the train_step for the discrete phase
+     jit_train_step_discrete = jax.jit(train_step, static_argnums=(2, 3))
+
+     for step in tqdm.tqdm(range(hypers.phase2_steps), desc="Discrete Fine-tuning"):
+         continuous_params, opt_state, loss = jit_train_step_discrete(
+             continuous_params, opt_state, discrete_optimizer.opt, discrete_optimizer._loss_fn
+         )
+         if (step + 1) % 2000 == 0:
+             print(f"Step {step+1} | Discrete Loss: {loss:.8f}")
+         if loss < 1e-7:
+             print("\nFound a perfect solution!")
+             break
+
+     final_discrete_decomposition = jax.tree_util.tree_map(round_to_half_ste, continuous_params)
+     final_loss = l2_loss_real(
+         target_tensor, jnp.einsum("ir,jr,kr->ijk", *final_discrete_decomposition)
+     )
+     print(f"Search complete. Final discrete loss: {final_loss:.8f}")
+
+     final_decomposition_np = jax.tree_util.tree_map(np.array, final_discrete_decomposition)
+     return final_decomposition_np, n, m, p, float(final_loss), hypers.rank
+
+
+ # EVOLVE-BLOCK-END
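
A quick illustrative check of the STE behavior, assuming the definitions above are in scope: the forward pass snaps to half-integers while the gradient is passed through unchanged:

    import jax
    import jax.numpy as jnp

    x = jnp.array([0.2, 0.6, 1.3])
    print(round_to_half_ste(x))                                # [0.  0.5 1.5]
    print(jax.grad(lambda v: round_to_half_ste(v).sum())(x))   # [1. 1. 1.]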
benchmarks/math/minimizing_max_min_dist/2/config.yaml ADDED
@@ -0,0 +1,29 @@
+ # Math benchmark: minimizing_max_min_dist/2
+ # Usage: skydiscover-run initial_program.py evaluator.py -c config.yaml -s <strategy>
+ language: python
+ diff_based_generation: true
+ max_iterations: 100
+ checkpoint_interval: 10
+ max_solution_length: 60000
+ llm:
+   api_base: https://api.openai.com/v1
+   models:
+     - name: "gpt-5"
+       weight: 1.0
+   max_tokens: 32000
+   timeout: 600
+ prompt:
+   system_message: |
+     SETTING:
+     You are an expert computational geometer and optimization specialist focusing on point dispersion problems.
+     Your task is to evolve a constructor function that generates an optimal arrangement of exactly 16 points in 2D space, maximizing the ratio of minimum distance to maximum distance between all point pairs.
+
+     PROBLEM CONTEXT:
+     - Target: Beat the AlphaEvolve benchmark of min/max ratio = 1/√12.889266112 ≈ 0.2786
+     - Constraint: Points must be placed in 2D Euclidean space (typically normalized to the unit square [0,1] × [0,1])
+     - Mathematical formulation: For points Pi = (xi, yi), i = 1,...,16:
+       * Distance matrix: dij = √[(xi-xj)² + (yi-yj)²] for all i≠j
+       * Minimum distance: dmin = min{dij : i≠j}
+       * Maximum distance: dmax = max{dij : i≠j}
+       * Objective: maximize dmin/dmax subject to spatial constraints
+
+     PERFORMANCE METRICS:
+     1. **min_max_ratio**: (dmin/dmax)² as reported by the evaluator (PRIMARY OBJECTIVE - maximize)
+     2. **combined_score**: min_max_ratio / (1/12.889266112) (a value > 1 means the AlphaEvolve benchmark is beaten)
+     3. **eval_time**: Execution time in seconds (balance accuracy vs. efficiency)
+
+     TECHNICAL REQUIREMENTS:
+     - **Reproducibility**: Fixed random seeds for all stochastic components
+ evaluator:
+   timeout: 360
+   max_retries: 3
benchmarks/math/minimizing_max_min_dist/2/evaluator/Dockerfile ADDED
@@ -0,0 +1,13 @@
+ FROM python:3.12-slim
+ WORKDIR /benchmark
+
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # wrapper.py provides backwards compatibility for old Python-based evaluators
+ # that define evaluate(program_path) -> dict. Bridges them to the container
+ # JSON protocol. Source of truth: skydiscover/evaluation/wrapper.py
+ COPY . .
+ RUN chmod +x evaluate.sh
+
+ ENTRYPOINT ["./evaluate.sh"]
benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluate.sh ADDED
@@ -0,0 +1,7 @@
+ #!/usr/bin/env bash
+ set -euo pipefail
+
+ PROGRAM="$1"
+ # MODE ($2) accepted but ignored — override this file to use train/test splits.
+
+ python /benchmark/evaluator.py "$PROGRAM"
benchmarks/math/minimizing_max_min_dist/2/evaluator/evaluator.py ADDED
@@ -0,0 +1,78 @@
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # This file implements the evaluator for the problem of minimizing the ratio of maximum
+ # to minimum distance in dimension 2 with 16 points.
+ #
+ # ===--------------------------------------------------------------------------------------===#
+ #
+ # Some of the code in this file is adapted from:
+ #
+ # google-deepmind/alphaevolve_results:
+ #   Licensed under the Apache License v2.0.
+ #
+ # ===--------------------------------------------------------------------------------------===#
+
+ import sys
+ import os
+ from importlib import __import__
+ import time
+ import numpy as np
+ from scipy.spatial.distance import pdist
+
+ NUM_POINTS = 16
+ DIMENSION = 2
+ BENCHMARK = 1 / 12.889266112
+
+ # Scoring: (dmin/dmax)^2.
+ # Key reformulation: maximize auxiliary variable t
+ # subject to d(i,j)^2 >= t AND d(i,j)^2 <= 1 for every pair (i,j).
+ # This is a constrained NLP with O(n^2) pairwise inequality constraints.
+
+
+ def evaluate(program_path: str):
+     try:
+         abs_program_path = os.path.abspath(program_path)
+         program_dir = os.path.dirname(abs_program_path)
+         module_name = os.path.splitext(os.path.basename(program_path))[0]
+
+         try:
+             sys.path.insert(0, program_dir)
+             program = __import__(module_name)
+             start_time = time.time()
+             points = program.min_max_dist_dim2_16()
+             end_time = time.time()
+             eval_time = end_time - start_time
+         except Exception as err:
+             raise err
+         finally:
+             if program_dir in sys.path:
+                 sys.path.remove(program_dir)
+
+         if not isinstance(points, np.ndarray):
+             points = np.array(points)
+
+         if points.shape != (NUM_POINTS, DIMENSION):
+             raise ValueError(
+                 f"Invalid shapes: points = {points.shape}, expected {(NUM_POINTS,DIMENSION)}"
+             )
+
+         pairwise_distances = pdist(points)
+         min_distance = np.min(pairwise_distances)
+         max_distance = np.max(pairwise_distances)
+
+         ratio_squared = (min_distance / max_distance) ** 2 if max_distance > 0 else 0
+         return {
+             "min_max_ratio": float(ratio_squared),
+             "combined_score": float(ratio_squared / BENCHMARK),
+             "eval_time": float(eval_time),
+         }
+     except Exception as e:
+         return {"combined_score": 0.0, "error": str(e)}
+
+
+ if __name__ == "__main__":
+     # Backwards-compat: bridges old evaluate() -> dict to the container JSON
+     # protocol. wrapper.py is copied from skydiscover/evaluation/wrapper.py.
+     from wrapper import run
+
+     run(evaluate)
benchmarks/math/minimizing_max_min_dist/2/evaluator/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ numpy
+ scipy
benchmarks/math/minimizing_max_min_dist/2/evaluator/wrapper.py ADDED
@@ -0,0 +1,98 @@
+ """Backwards-compat wrapper for old Python-based evaluators.
+
+ Old-style evaluators define ``evaluate(program_path) -> dict``. This module
+ bridges that interface to the container JSON protocol expected by
+ ContainerizedEvaluator.
+
+ Usage — add this to the bottom of your evaluator.py::
+
+     if __name__ == "__main__":
+         from wrapper import run
+         run(evaluate)
+ """
+
+ import json
+ import sys
+ import traceback
+
+
+ def run(evaluate_fn):
+     """Call *evaluate_fn*, format the result as container-protocol JSON on stdout.
+
+     * Reads ``sys.argv[1]`` as the program path.
+     * Redirects stdout → stderr while *evaluate_fn* runs so that debug prints
+       don't contaminate the JSON output.
+     * Separates numeric metrics from non-numeric artifacts.
+     * Guarantees ``combined_score`` is always present in metrics.
+     """
+     if len(sys.argv) < 2:
+         print("Usage: evaluator.py <program_path>", file=sys.stderr)
+         sys.exit(1)
+
+     program_path = sys.argv[1]
+
+     # Redirect stdout → stderr during evaluation so debug prints from
+     # the evaluator don't contaminate the JSON output on stdout.
+     real_stdout = sys.stdout
+     sys.stdout = sys.stderr
+     try:
+         result = evaluate_fn(program_path)
+     except Exception as e:
+         sys.stdout = real_stdout
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": str(e),
+                         "traceback": traceback.format_exc(),
+                     },
+                 }
+             )
+         )
+         return
+     sys.stdout = real_stdout
+
+     if not isinstance(result, dict):
+         print(
+             json.dumps(
+                 {
+                     "status": "error",
+                     "combined_score": 0.0,
+                     "metrics": {"combined_score": 0.0},
+                     "artifacts": {
+                         "error": f"evaluate() returned {type(result).__name__}, expected dict"
+                     },
+                 }
+             )
+         )
+         return
+
+     # Separate numeric metrics from non-numeric artifacts.
+     metrics = {}
+     artifacts = {}
+     for k, v in result.items():
+         if isinstance(v, bool):
+             metrics[k] = float(v)
+         elif isinstance(v, (int, float)):
+             metrics[k] = float(v)
+         elif isinstance(v, str):
+             artifacts[k] = v
+         elif isinstance(v, (list, dict)):
+             artifacts[k] = json.dumps(v)
+
+     if "combined_score" not in metrics:
+         metrics["combined_score"] = 0.0
+
+     status = "error" if "error" in artifacts else "success"
+     output = {
+         "status": status,
+         "combined_score": metrics["combined_score"],
+         "metrics": metrics,
+     }
+     if artifacts:
+         output["artifacts"] = artifacts
+
+     print(json.dumps(output))
benchmarks/math/minimizing_max_min_dist/2/initial_program.py ADDED
@@ -0,0 +1,24 @@
+ # EVOLVE-BLOCK-START
+ import numpy as np
+
+
+ def min_max_dist_dim2_16() -> np.ndarray:
+     """
+     Creates 16 points in 2 dimensions in order to maximize the ratio of minimum to maximum distance.
+
+     Returns
+         points: np.ndarray of shape (16,2) containing the (x,y) coordinates of the 16 points.
+     """
+
+     n = 16
+     d = 2
+
+     # Place points randomly
+     np.random.seed(42)
+     points = np.random.randn(n, d)
+
+     return points
+
+
+ # EVOLVE-BLOCK-END
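
A stronger hand-written baseline for comparison (my sketch, not part of the benchmark): a 4x4 unit grid gives dmin/dmax = 1/(3*sqrt(2)), so the evaluator reports min_max_ratio = 1/18 ≈ 0.0556 and combined_score ≈ 0.0556 × 12.889 ≈ 0.72, still short of the benchmark but well above the random initializer above:

    import numpy as np

    def grid_16() -> np.ndarray:
        # 16 points at integer coordinates (0..3, 0..3): dmin = 1, dmax = 3*sqrt(2).
        xs, ys = np.meshgrid(np.arange(4.0), np.arange(4.0))
        return np.stack([xs.ravel(), ys.ravel()], axis=1)  # shape (16, 2)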